diff --git a/.github/ISSUE_TEMPLATE/---document-issue-.md b/.github/ISSUE_TEMPLATE/---document-issue-.md index 7c464ac584bc87cb16a796bf41acdcd79b8bd6f0..ffc2fcd7817b64584637a646edf5907612a7bbaf 100644 --- a/.github/ISSUE_TEMPLATE/---document-issue-.md +++ b/.github/ISSUE_TEMPLATE/---document-issue-.md @@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc. #### Other -For example: The doc link is broken; The doc page is missing; Dead link in docs. \ No newline at end of file +For example: The doc link is broken; The doc page is missing; Dead link in docs. diff --git a/CMakeLists.txt b/CMakeLists.txt index f24513d605c49b608cb32425a861448a3acd6c6a..f30671bd3a87e87732b3a047e91811452370e06e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ # limitations under the License cmake_minimum_required(VERSION 3.10) +cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -21,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -32,16 +30,23 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# to develop some acl related functionality on x86 +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It use option above, so put here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) +if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. -if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) - message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " - "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. 
Please refer to the install document: https://cmake.org/install/") +if (WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() if(WITH_GPU AND NOT APPLE) @@ -61,6 +66,10 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +endif() + if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) @@ -72,6 +81,13 @@ if(WIN32) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + endif() + if (MSVC_STATIC_CRT) message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd") @@ -89,8 +105,8 @@ if(WIN32) endforeach(flag_var) endif() - # NOTE(Avin0323): Less parallel count result in faster compilation. math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -98,7 +114,10 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling + if(NOT WITH_GPU) + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + endif() endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") @@ -116,6 +135,13 @@ if(WIN32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() + endforeach(flag_var) + if (WITH_WIN_DUMP_DBG) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") @@ -153,8 +179,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -165,14 +189,15 @@ 
option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -180,6 +205,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -240,9 +266,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -290,9 +313,9 @@ endif(WITH_ROCM) if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING - "Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable RCCL when compiling without GPU" FORCE) + "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) @@ -330,6 +353,11 @@ if (WITH_MIPS) add_definitions(-DPADDLE_WITH_MIPS) endif() +if (WITH_HETERPS) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() +endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") @@ -347,6 +375,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/README.md b/README.md index e8a7013d0b4432bc871843b83cf19494ca870cbc..8b437e4115abe80073866f52f3d7e387e2a554d3 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - -

+
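# --- Minimal sketch (not part of the patch): one way the new WITH_STRIP option and the
# STRIP_PATH found by `find_program(STRIP_PATH strip)` in the CMakeLists.txt hunk above
# could be consumed. The patch itself only defines the option and disables it when the
# strip tool is missing or the host is not Linux; the post-build step and the
# `my_extension` shared-library target below are assumptions for illustration only.
if(WITH_STRIP)
  add_custom_command(TARGET my_extension POST_BUILD
    # Remove the symbol table from the freshly built .so to shrink the wheel package.
    COMMAND ${STRIP_PATH} -s $<TARGET_FILE:my_extension>
    COMMENT "Stripping shared library my_extension")
endif()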

diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9c1bd52e7fb7dfad5f6dc36d850468bf69ee92cd..e7f125269be1f5e015c6cf015489c312538ca4ba 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -82,6 +82,10 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() +if(WITH_ASCEND_CL) + add_definitions(-DPADDLE_WITH_ASCEND_CL) +endif() + if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) @@ -93,13 +97,18 @@ if(WITH_GPU) FIND_PACKAGE(CUDA REQUIRED) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) + message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") endif() if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() + + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") + endif() + if(CUPTI_FOUND) include_directories(${CUPTI_INCLUDE_DIR}) add_definitions(-DPADDLE_WITH_CUPTI) @@ -164,10 +173,9 @@ if(WITH_PSCORE) add_definitions(-DPADDLE_WITH_PSCORE) endif() - -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) +if(WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) +endif() if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 2f4f5449f482d71a2a27957af4b5f17601ab634f..7f2addb02d36ddf85cd08542cc5baab31d495bc5 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,15 +6,9 @@ endif() if (WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") - set(paddle_known_gpu_archs7 "53") - set(paddle_known_gpu_archs8 "53 62") - set(paddle_known_gpu_archs9 "53 62") set(paddle_known_gpu_archs10 "53 62 72") else() - set(paddle_known_gpu_archs "30 35 50 52 60 61 70") - set(paddle_known_gpu_archs7 "30 35 50 52") - set(paddle_known_gpu_archs8 "30 35 50 52 60 61") - set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() @@ -74,7 +68,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual") set(archs_name_default "Auto") list(APPEND archs_names "Auto") @@ -91,7 +85,7 @@ function(select_nvcc_arch_flags out_variable) if(${CUDA_ARCH_NAME} STREQUAL "Manual") set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) else() unset(CUDA_ARCH_BIN CACHE) @@ -108,6 +102,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") + set(cuda_arch_bin "80") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -158,31 +154,21 @@ function(select_nvcc_arch_flags out_variable) endfunction() message(STATUS "CUDA 
detected: " ${CMAKE_CUDA_COMPILER_VERSION}) -if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) +if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1 + set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+ + set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") endif() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) @@ -198,14 +184,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") -# Set C++11 support +# Set C++14 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. - # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. - set(CMAKE_CUDA_STANDARD 11) -endif(NOT WIN32) +set(CMAKE_CUDA_STANDARD 14) # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w # So replace /W[1-4] with /W0 diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index d8d8f634e76b6bf05d4936921ce37c889a4bdc7c..c82847100abefa6fcbaf1367965699413aadcadb 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file) "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") message(STATUS "Current cuDNN header is ${cudnn_header_file} " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. 
") endif() endif() endmacro() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bcf0c0a0646fc386f41c4b1f35ba773d6a1adb6f..414b2a54be0342b3ef76d5e3a553577cb5f3e4be 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,50 +12,78 @@ # See the License for the specific language governing permissions and # limitations under the License. -INCLUDE(ExternalProject) - -SET(ASCEND_PROJECT "extern_ascend") -IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) - SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) - SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") -SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") -SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") -SET(ASCEND_DST_DIR "ascend") -SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) -SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) -SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) -SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) -SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) -SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") - -INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) -FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ASCEND)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" - " DESTINATION ${ASCEND_DST_DIR})\n") -ExternalProject_Add( - ${ASCEND_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ASCEND_SOURCE_DIR} - DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz - && tar zxvf ${ASCEND_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} -) -ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) - -ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) -ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT}) +#NOTE: Logic is from +# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_DIR /usr/local/Ascend) +endif() + +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + + +if(WITH_ASCEND OR WITH_ASCEND_CL) + set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) + set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) + set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) + set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) + set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) + set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) + set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + + set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) + set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) + set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ATLAS_RUNTIME_INC_DIR 
${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) + set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) + set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + + set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) + set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) + set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) + INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + + + ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) + + ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + + ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + + add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) +endif() + +if(WITH_ASCEND_CL) + set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) + set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) + set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) + set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) + + message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") + message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") + INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) + INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) + + ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) + add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) + +endif() diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 0eb590c42d0cb73ccb252430bc3e27312b0bddf9..2d72b6eb56deaa2547051756afc075a100aeb251 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -39,9 +39,9 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/ ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(gongwb): change to de newst repo when they changed. 
+ # TODO(gongwb): change to de newst repo when they changed GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47" + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a755a816c332a2517ed61caa94d647afd557aae..aa471002eacb6a61a9cf835f293a86a75d87db8f 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -14,11 +14,11 @@ include(ExternalProject) -# update eigen to the commit id 4da2c6b1 on 03/19/2020 +# update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26) +set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) cache_third_party(extern_eigen3 REPOSITORY ${EIGEN_REPOSITORY} @@ -27,47 +27,15 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst) - # For Windows - # which will cause a compilation error in Tensor:74: - # "can not open file 'unistd.h'" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2) - # For VS2015 - # which will cause a compilation error in TensorBlock.h:1028: - # "syntax error" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3) - set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y) elseif(LINUX) - # For gxx=4.8, __GXX_ABI_VERSION is less than 1004 - # which will cause a compilation error in Geometry_SSE.h:38: - # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)" - # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60 - # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8 - # so use following patch to solve compilation error with different version of gcc. 
- file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1) - # The compiler fully support const expressions since c++14, - # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11 - # add patch to avoid compilation error in c++11 - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2) if(WITH_ROCM) # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3) - # For HIPCC Eigen::internal::scalar_sum_op is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4) - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4}) - else() - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) endif() endif() @@ -82,7 +50,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a690578bd16c89cc83a158dacca4cf..e8db13a694f5578e314dc1a7c95ed24ad88bad02 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) + if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 536e95c1dc2a4fe6545bd5d3147631aa26cdda98..0000000000000000000000000000000000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. 
- URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. - # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd81f30e17f7a86380947262014e402a..fb1d4d9d56dcc6f38a86242b4d78b88ef31ddaa0 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3077a5a47289d20906f7c180681b65..c108c05368c915f6d4998d46713cda315dfb93ff 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +242,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93ca1c25e5b59ecc3b063b4837db77b..f9cb3a9075a821025129c1f6acb479a4ad6ac95c 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY 
${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 0ee3e2116a94b68d528a475a453d1c31f0464cf4..c591a9391dfa5d3b5a452ffbb5a5d3199d387519 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,11 +14,17 @@ INCLUDE(ExternalProject) +IF(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +ENDIF() + SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8) +set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -37,38 +43,92 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + 
-DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS_DEBUG $) + else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + endif() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index b5a3f0154745b9425c3dfc45a129117238fa80de..f846623602ed79a5bd84268436a59ede1957364b 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e110524dd1abb864649daf8bd763e69ae87c600d..a2ddad557c2956f7de21bceaf7a6699e8dfbed43 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -4,10 +4,10 @@ include(CheckCCompilerFlag) include(CheckCXXSymbolExists) include(CheckTypeSize) -function(CheckCompilerCXX11Flag) +function(CheckCompilerCXX14Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") endif() @@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag) message(FATAL_ERROR "Unsupported AppleClang version. 
AppleClang >= 5.1 required.") endif() else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.") endif() endif() endif() endfunction() -CheckCompilerCXX11Flag() -if (WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - endif() -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -endif() +CheckCompilerCXX14Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ba86cfabdf173467973b9d4337e6edbbe84c5889..a5c74a46631e9d76fa78261f706a1853a80bab32 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -492,10 +503,8 @@ function(nv_library TARGET_NAME) message(FATAL "Please specify source file or library in nv_library.") endif() endif(nv_library_SRCS) - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - if(${MSVC_VERSION} LESS_EQUAL 1900) - set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) - endif() + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) + set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() endfunction(nv_library) @@ -512,7 +521,7 @@ function(nv_binary TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) common_link(${TARGET_NAME}) endif() - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() @@ -539,7 +548,7 @@ function(nv_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() @@ -809,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) 
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 2cba3d06936081097a773295c7f91e7aa53564a6..9694a7bc59c12a96e1c0c33488895ae94dbf2a03 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -192,6 +192,15 @@ include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/* DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING @@ -202,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index aea02088750df4edc71a4909489c8ba250c8bb64..b11156d2e9986f879dcf4dd63354edb81c493260 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,6 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() + # It can specify CUDA compile flag manualy, + # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous + # because CUDA will update by nvidia, then error will occur. + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0343ff3cc292d97dcc77108735baa69c804468af..33390745cc8c96bc00b9eab84dfb637a8a76c2f9 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -11,6 +11,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(npu_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) set(cudnn_cu_srcs) @@ -20,6 +21,9 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) + if (WITH_ASCEND_CL) + set(op_common_deps ${op_common_deps} npu_op_runner) + endif() # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. 
set(options UNITY) set(oneValueArgs "") @@ -40,6 +44,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (WITH_NV_JETSON) + list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) @@ -85,6 +92,12 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_ASCEND_CL) + string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) + list(APPEND npu_cc_srcs ${NPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") @@ -107,6 +120,8 @@ function(op_library TARGET) list(APPEND cu_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") + list(APPEND npu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -168,15 +183,15 @@ function(op_library TARGET) list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") list(REMOVE_ITEM hip_srcs "cholesky_op.cu") - list(REMOVE_ITEM hip_srcs "correlation_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") + list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) + compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}) if(TARGET ${UNITY_TARGET}) # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) @@ -187,7 +202,7 @@ function(op_library TARGET) # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() endif() @@ -207,6 +222,7 @@ function(op_library TARGET) # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
+ set(ORIGINAL_TARGET ${TARGET}) file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") # [ \t\r\n]* is used for blank characters @@ -239,8 +255,9 @@ function(op_library TARGET) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) + list(LENGTH npu_cc_srcs npu_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) + ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -280,6 +297,26 @@ function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() + + if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) + file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT) + # It is different from the logic above, becareful + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}") + # [ \t\r\n]* is used for blank characters + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}") + + if (one_npu_register STREQUAL "") + string(REPLACE "_op" "" NPU_TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}") + string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. + string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}") + endif() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -330,6 +367,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") string(REPLACE "_xpu" "" OPS "${OPS}") + string(REPLACE "_npu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 0115ad4b59fc466ea10be6912257c40d31ed3640..3c069bd2981c437a1450ede29db2449dc46a9a4a 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -15,7 +15,7 @@ InheritFromHost -ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions] - --use-local-env --cl-version $(CudaClVersion) + --use-local-env $(CudaClVersion) [CodeGeneration] -clean @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 6488d29afc5f7f4af72aab1cf2463d900a89fa9d..56edaff2a50dab0f7029ec1e85fc3d4ce8ac416e 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. 
REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. # For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." "Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -261,6 +261,14 @@ if(WITH_PSLIB) if(WITH_PSLIB_BRPC) include(external/pslib_brpc) # download, build, install pslib_brpc list(APPEND third_party_deps extern_pslib_brpc) + else() + include(external/snappy) + list(APPEND third_party_deps extern_snappy) + + include(external/leveldb) + list(APPEND third_party_deps extern_leveldb) + include(external/brpc) + list(APPEND third_party_deps extern_brpc) endif() endif(WITH_PSLIB) @@ -274,10 +282,15 @@ if(WITH_BOX_PS) list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - list(APPEND third_party_deps extern_ascend) -endif (WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend) + endif() + if(WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend_cl) + endif() +endif () if (WITH_PSCORE) include(external/snappy) @@ -285,7 +298,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/README_cn.md b/go/README_cn.md index a184ecbb8dea1ae71074ef9686d088a5f4cf0f33..040540e939bc3a0993e7c963b281ad91fbfe1ffc 100644 --- a/go/README_cn.md +++ b/go/README_cn.md @@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32) 运行 ```bash +go mod init github.com/paddlepaddle export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH go run ./demo/mobilenet.go ``` diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go index 1b42fe8049a584616da7b4940fd19a89df9bc52b..c1ca2e967f72dc6646a6785d86ba59c709bfe25c 100644 --- a/go/demo/mobilenet.go +++ b/go/demo/mobilenet.go @@ -13,7 +13,7 @@ // limitations under the License. 
package main -import "../paddle" +import "github.com/paddlepaddle/paddle" import "strings" import "io/ioutil" import "strconv" diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 0000000000000000000000000000000000000000..b4f42dab6790bfb6dd33860a8ada704166bb74ac --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/go/paddle/common.go b/go/paddle/common.go index 4bf947659312824216e6003cb2f150ae39a94d00..cbbde6a45f59b80931a3a2c501581819085e8ea7 100644 --- a/go/paddle/common.go +++ b/go/paddle/common.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // 
#include // #include import "C" diff --git a/go/paddle/config.go b/go/paddle/config.go index 89f7d7e63ff2a858f058ad22ea424b29f66a4477..68a31230997bed73fbab1c1d1c7af123e353cf97 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go index 59bad908e6a5082e38b8bb33c849aa1097107d76..5f2b2c81a60549dfdbf22dd31a98560e7e3a8cee 100644 --- a/go/paddle/predictor.go +++ b/go/paddle/predictor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include "paddle_c_api.h" import "C" @@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string { } func (predictor *Predictor) GetOutputNames() []string { - names := make([]string, predictor.GetInputNum()) + names := make([]string, predictor.GetOutputNum()) for i := 0; i < len(names); i++ { names[i] = predictor.GetOutputName(i) } diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go index e6e2c53fef1af565d4efba976d10839efe22517d..6fbcf039f88a7cc43a5d28f0433c9feb965566f0 100644 --- a/go/paddle/tensor.go +++ b/go/paddle/tensor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include @@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va value := reflect.Indirect(ptr) value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0]))) if len(shape) == 1 && value.Len() > 0 { - switch value.Index(1).Kind() { + switch value.Index(0).Kind() { case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: binary.Read(r, Endian(), value.Interface()) return diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a33b9158713304a68c6ac757aab4f..98d4bfd0326c5c524fcac9129f58d0ae99fc8afe 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. */ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index c18332d3b873164a725a25316fc611aa7e7a3092..dcff02a662e2734bc66d4cf219fce527fd0961aa 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. 
add_subdirectory(inference) -add_subdirectory(train) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5a2d7a06201ba4acff679ffcfee87fde8d025ed6..905347d031b35b39b43879c7bd78ab39e933a5b3 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -11,9 +11,10 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() -add_subdirectory(table) add_subdirectory(service) +add_subdirectory(table) add_subdirectory(test) +add_subdirectory(index_dataset) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index b638af49730dd4800109729c9d91afa82efa80e4..dfd55f16e1a065e46b2186a6a589eabc1ac3b431 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -177,8 +177,11 @@ std::future FleetWrapper::PullSparseVarsAsync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } - return pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + + bool training = true; + return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(), + table_id, fea_keys->data(), + fea_keys->size(), training); } void FleetWrapper::PullSparseVarsSync( @@ -224,8 +227,10 @@ void FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } + bool training = true; auto status = pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(), + training); pull_sparse_status.push_back(std::move(status)); for (auto& t : pull_sparse_status) { t.wait(); @@ -238,9 +243,13 @@ void FleetWrapper::PullSparseVarsSync( } } +// is_training is true means training, false means inference, the behavior is +// different on pserver + void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, std::vector* outputs) { std::vector fea_keys; @@ -279,7 +288,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, } auto* communicator = Communicator::GetInstance(); auto status = communicator->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size()); + pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(), + is_training); status.wait(); auto ret = status.get(); if (ret != 0) { diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index ac566606ddcb4024eeaf7b846c894f7f5cdafa82..0da5d1e2bf987f38de3b9a03c659fc5e1841eca1 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -95,8 +95,12 @@ class FleetWrapper { // Pull sparse variables from server in sync mode // pull immediately to tensors + // is_training is true means training, false means inference, the behavior is + // different on pserver + void PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, // NOLINT std::vector* outputs); // NOLINT diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30488494a52bcfea61476caeb1ab08e3e6781a1 --- /dev/null +++ 
b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -0,0 +1,7 @@ +proto_library(index_dataset_proto SRCS index_dataset.proto) +cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper) + +if(WITH_PYTHON) + py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) +endif() diff --git a/paddle/fluid/distributed/index_dataset/index_dataset.proto b/paddle/fluid/distributed/index_dataset/index_dataset.proto new file mode 100644 index 0000000000000000000000000000000000000000..1b4ee313671ad503b9e46dbe9e34d4a69d0cfc4d --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_dataset.proto @@ -0,0 +1,32 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; +package paddle.distributed; + +message IndexNode { + required uint64 id = 1; + required bool is_leaf = 2; + required float probability = 3; +} + +message TreeMeta { + required int32 height = 1; + required int32 branch = 2; +} + +message KVItem { + required bytes key = 1; + required bytes value = 2; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc new file mode 100644 index 0000000000000000000000000000000000000000..3e573bbdd2de97130a109ddb583a724cf363c6be --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
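The `index_dataset.proto` messages introduced above (`IndexNode`, `TreeMeta`, `KVItem`) describe the on-disk records of a tree index. As a rough, hypothetical sketch only (not part of this patch), the snippet below shows how a file in the format that `TreeIndex::Load` in `index_wrapper.cc` expects could be produced: each record is a native `int` length prefix followed by a serialized `KVItem`, where the reserved key `.tree_meta` holds a `TreeMeta` and every other key is a node code whose value is an `IndexNode`. The output path and node values are invented for illustration.

```cpp
#include <cstdio>
#include <string>

#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h"

namespace pd = paddle::distributed;

// Writes one length-prefixed KVItem record, mirroring what TreeIndex::Load
// reads back: a native int with the byte length, then the serialized message.
static void WriteRecord(std::FILE* fp, const std::string& key,
                        const std::string& value) {
  pd::KVItem item;
  item.set_key(key);
  item.set_value(value);
  std::string buf;
  item.SerializeToString(&buf);
  int len = static_cast<int>(buf.size());
  std::fwrite(&len, sizeof(len), 1, fp);
  std::fwrite(buf.data(), 1, buf.size(), fp);
}

int main() {
  // Hypothetical output path; a real tree would contain many more nodes.
  std::FILE* fp = std::fopen("demo_tree.idx", "wb");

  pd::TreeMeta meta;
  meta.set_height(2);
  meta.set_branch(2);
  WriteRecord(fp, ".tree_meta", meta.SerializeAsString());

  // One leaf node stored under its tree code ("1") with item id 10.
  pd::IndexNode node;
  node.set_id(10);
  node.set_is_leaf(true);
  node.set_probability(1.0f);
  WriteRecord(fp, "1", node.SerializeAsString());

  std::fclose(fp);
  return 0;
}
```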
+ +#include "paddle/fluid/distributed/index_dataset/index_sampler.h" + +namespace paddle { +namespace distributed { + +std::vector> LayerWiseSampler::sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) { + auto input_num = target_ids.size(); + auto user_feature_num = user_inputs[0].size(); + std::vector> outputs( + input_num * layer_counts_sum_, + std::vector(user_feature_num + 2)); + + auto max_layer = tree_->Height(); + size_t idx = 0; + for (size_t i = 0; i < input_num; i++) { + auto travel_codes = + tree_->GetTravelCodes(target_ids[i], start_sample_layer_); + auto travel_path = tree_->GetNodes(travel_codes); + for (size_t j = 0; j < travel_path.size(); j++) { + // user + if (j > 0 && with_hierarchy) { + auto ancestor_codes = + tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1); + auto hierarchical_user = tree_->GetNodes(ancestor_codes); + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = hierarchical_user[k].id(); + } + } + } else { + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = user_inputs[i][k]; + } + } + } + + // sampler ++ + outputs[idx][user_feature_num] = travel_path[j].id(); + outputs[idx][user_feature_num + 1] = 1.0; + idx += 1; + for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { + int sample_res = 0; + do { + sample_res = sampler_vec_[j]->Sample(); + } while (layer_ids_[j][sample_res].id() == travel_path[j].id()); + outputs[idx + idx_offset][user_feature_num] = + layer_ids_[j][sample_res].id(); + outputs[idx + idx_offset][user_feature_num + 1] = 0; + } + idx += layer_counts_[j]; + } + } + return outputs; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h new file mode 100644 index 0000000000000000000000000000000000000000..8813421446a21c1379ca872952fe8b367d0724ca --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -0,0 +1,120 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/sampler.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class IndexSampler { + public: + virtual ~IndexSampler() {} + IndexSampler() {} + + template + static std::shared_ptr Init(const std::string& name) { + std::shared_ptr instance = nullptr; + instance.reset(new T(name)); + return instance; + } + + virtual void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer = 1, int seed = 0) {} + virtual void init_beamsearch_conf(const int64_t k) {} + virtual std::vector> sample( + const std::vector>& user_inputs, + const std::vector& input_targets, + bool with_hierarchy = false) = 0; +}; + +class LayerWiseSampler : public IndexSampler { + public: + virtual ~LayerWiseSampler() {} + explicit LayerWiseSampler(const std::string& name) { + tree_ = IndexWrapper::GetInstance()->get_tree_index(name); + } + + void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer, int seed) override { + seed_ = seed; + start_sample_layer_ = start_sample_layer; + + PADDLE_ENFORCE_GT( + start_sample_layer_, 0, + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should greater than 0.", + start_sample_layer_)); + PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(), + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should less than " + "max_layer, which is [%d].", + start_sample_layer_, tree_->Height())); + + size_t i = 0; + layer_counts_sum_ = 0; + layer_counts_.clear(); + int cur_layer = start_sample_layer_; + while (cur_layer < tree_->Height()) { + int layer_sample_num = 1; + if (i < layer_sample_counts.size()) { + layer_sample_num = layer_sample_counts[i]; + } + layer_counts_sum_ += layer_sample_num + 1; + layer_counts_.push_back(layer_sample_num); + VLOG(3) << "[INFO] level " << cur_layer + << " sample_layer_counts.push_back: " << layer_sample_num; + cur_layer += 1; + i += 1; + } + reverse(layer_counts_.begin(), layer_counts_.end()); + VLOG(3) << "sample counts sum: " << layer_counts_sum_; + + auto max_layer = tree_->Height(); + sampler_vec_.clear(); + layer_ids_.clear(); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids_.push_back(tree_->GetNodes(layer_codes)); + auto sampler_temp = + std::make_shared( + layer_ids_[idx].size() - 1, seed_); + sampler_vec_.push_back(sampler_temp); + layer_index--; + idx++; + } + } + std::vector> sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) override; + + private: + std::vector layer_counts_; + int64_t layer_counts_sum_{0}; + std::shared_ptr tree_{nullptr}; + int seed_{0}; + int start_sample_layer_{1}; + std::vector> sampler_vec_; + std::vector> layer_ids_; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..99fe4ca0c6d043caef01a867a5acc0d40841ee01 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/io/fs.h" + +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + +namespace paddle { +namespace distributed { + +std::shared_ptr<IndexWrapper> IndexWrapper::s_instance_(nullptr); + +int TreeIndex::Load(const std::string filename) { + int err_no; + auto fp = paddle::framework::fs_open_read(filename, &err_no, ""); + PADDLE_ENFORCE_NE( + fp, nullptr, + platform::errors::InvalidArgument( + "Open file %s failed. Please check whether the file exists.", + filename)); + + int num = 0; + max_id_ = 0; + fake_node_.set_id(0); + fake_node_.set_is_leaf(false); + fake_node_.set_probability(0.0); + max_code_ = 0; + size_t ret = fread(&num, sizeof(num), 1, fp.get()); + while (ret == 1 && num > 0) { + std::string content(num, '\0'); + size_t read_num = + fread(const_cast<char *>(content.data()), 1, num, fp.get()); + PADDLE_ENFORCE_EQ( + read_num, static_cast<size_t>(num), + platform::errors::InvalidArgument( + "Read from file: %s failed. Valid Format is " + "an integer representing the length of the following string, " + "and the string itself. We got an integer [%d], " + "but the following string's length is [%d].", + filename, num, read_num)); + + KVItem item; + PADDLE_ENFORCE_EQ( + item.ParseFromString(content), true, + platform::errors::InvalidArgument("Parse from file: %s failed. 
It's " + "content can't be parsed by KVItem.", + filename)); + + if (item.key() == ".tree_meta") { + meta_.ParseFromString(item.value()); + } else { + auto code = boost::lexical_cast(item.key()); + IndexNode node; + node.ParseFromString(item.value()); + PADDLE_ENFORCE_NE(node.id(), 0, + platform::errors::InvalidArgument( + "Node'id should not be equel to zero.")); + if (node.is_leaf()) { + id_codes_map_[node.id()] = code; + } + data_[code] = node; + if (node.id() > max_id_) { + max_id_ = node.id(); + } + if (code > max_code_) { + max_code_ = code; + } + } + ret = fread(&num, sizeof(num), 1, fp.get()); + } + total_nodes_num_ = data_.size(); + max_code_ += 1; + return 0; +} + +std::vector TreeIndex::GetNodes(const std::vector& codes) { + std::vector nodes; + nodes.reserve(codes.size()); + for (size_t i = 0; i < codes.size(); i++) { + if (CheckIsValid(codes[i])) { + nodes.push_back(data_.at(codes[i])); + } else { + nodes.push_back(fake_node_); + } + } + return nodes; +} + +std::vector TreeIndex::GetLayerCodes(int level) { + uint64_t level_num = static_cast(std::pow(meta_.branch(), level)); + uint64_t level_offset = level_num - 1; + + std::vector res; + res.reserve(level_num); + for (uint64_t i = 0; i < level_num; i++) { + auto code = level_offset + i; + if (CheckIsValid(code)) { + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetAncestorCodes( + const std::vector& ids, int level) { + std::vector res; + res.reserve(ids.size()); + + int cur_level; + for (size_t i = 0; i < ids.size(); i++) { + if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) { + res.push_back(max_code_); + } else { + auto code = id_codes_map_.at(ids[i]); + cur_level = meta_.height() - 1; + + while (level >= 0 && cur_level > level) { + code = (code - 1) / meta_.branch(); + cur_level--; + } + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetChildrenCodes(uint64_t ancestor, + int level) { + auto level_code_num = static_cast(std::pow(meta_.branch(), level)); + auto code_min = level_code_num - 1; + auto code_max = meta_.branch() * level_code_num - 1; + + std::vector parent; + parent.push_back(ancestor); + std::vector res; + size_t p_idx = 0; + while (true) { + size_t p_size = parent.size(); + for (; p_idx < p_size; p_idx++) { + for (int i = 0; i < meta_.branch(); i++) { + auto code = parent[p_idx] * meta_.branch() + i + 1; + if (data_.find(code) != data_.end()) parent.push_back(code); + } + } + if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) { + break; + } + } + + return std::vector(parent.begin() + p_idx, parent.end()); +} + +std::vector TreeIndex::GetTravelCodes(uint64_t id, int start_level) { + std::vector res; + PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(), + paddle::platform::errors::InvalidArgument( + "id = %d doesn't exist in Tree.", id)); + auto code = id_codes_map_.at(id); + int level = meta_.height() - 1; + + while (level >= start_level) { + res.push_back(code); + code = (code - 1) / meta_.branch(); + level--; + } + return res; +} + +std::vector TreeIndex::GetAllLeafs() { + std::vector res; + res.reserve(id_codes_map_.size()); + for (auto& ite : id_codes_map_) { + auto code = ite.second; + res.push_back(data_.at(code)); + } + return res; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..8fb8faf6c84a2d9e1a5e80179a113b8d1ef312c8 --- 
/dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class Index { + public: + Index() {} + ~Index() {} +}; + +class TreeIndex : public Index { + public: + TreeIndex() {} + ~TreeIndex() {} + + int Height() { return meta_.height(); } + int Branch() { return meta_.branch(); } + uint64_t TotalNodeNums() { return total_nodes_num_; } + uint64_t EmbSize() { return max_id_ + 1; } + int Load(const std::string path); + + inline bool CheckIsValid(int code) { + if (data_.find(code) != data_.end()) { + return true; + } else { + return false; + } + } + + std::vector GetNodes(const std::vector& codes); + std::vector GetLayerCodes(int level); + std::vector GetAncestorCodes(const std::vector& ids, + int level); + std::vector GetChildrenCodes(uint64_t ancestor, int level); + std::vector GetTravelCodes(uint64_t id, int start_level); + std::vector GetAllLeafs(); + + std::unordered_map data_; + std::unordered_map id_codes_map_; + uint64_t total_nodes_num_; + TreeMeta meta_; + uint64_t max_id_; + uint64_t max_code_; + IndexNode fake_node_; +}; + +using TreePtr = std::shared_ptr; + +class IndexWrapper { + public: + virtual ~IndexWrapper() {} + IndexWrapper() {} + + void clear_tree() { tree_map.clear(); } + + TreePtr get_tree_index(const std::string name) { + PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(), + paddle::platform::errors::InvalidArgument( + "tree [%s] doesn't exist. Please insert it firstly " + "by API[\' insert_tree_index \'].", + name)); + return tree_map[name]; + } + + void insert_tree_index(const std::string name, const std::string tree_path) { + if (tree_map.find(name) != tree_map.end()) { + VLOG(0) << "Tree " << name << " has already existed."; + return; + } + TreePtr tree = std::make_shared(); + int ret = tree->Load(tree_path); + PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. 
Please " + "check whether the file exists.", + name, tree_path)); + tree_map.insert(std::pair{name, tree}); + } + + static std::shared_ptr GetInstancePtr() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_; + } + + static IndexWrapper* GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_.get(); + } + + private: + static std::shared_ptr s_instance_; + std::unordered_map tree_map; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index bb3f6f1174da9d49a8407ec8db16a5a2aa2a8336..d1f04e26ade7289bcb10988d02de01962a1889ab 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -24,11 +25,13 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - +set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) -cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) +cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc +ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) @@ -38,3 +41,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 
163526fe3b28c91f36e2670d1974b520ef3bf66a..a6ad9d08f52fda9bd79b1a1f0eebf1769c855eb3 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -768,8 +768,8 @@ std::future BrpcPsClient::push_global_step(int table_id, std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) { + const uint64_t *keys, size_t num, + bool is_training) { size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -837,16 +837,27 @@ std::future BrpcPsClient::pull_sparse(float **select_values, uint32_t kv_request_count = 0; size_t sorted_kv_size = sorted_kvs.size(); auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append((void *)&is_training, sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { ++kv_request_count; + uint32_t keys = 1; last_key = sorted_kvs[kv_idx].first; request_buffer.append((void *)&last_key, sizeof(uint64_t)); while (kv_idx < sorted_kv_size - 1 && last_key == sorted_kvs[kv_idx + 1].first) { ++kv_idx; + ++keys; } + keys_counter.push_back(keys); } + request_buffer.append((void *)keys_counter.data(), + sizeof(uint32_t) * keys_counter.size()); + if (kv_request_count == 0) { closure->Run(); } else { @@ -869,8 +880,8 @@ std::future BrpcPsClient::send_client2client_msg( auto promise = std::make_shared>(); std::future fut = promise->get_future(); if (to_client_id >= _client_channels.size()) { - LOG(FATAL) << "to_client_id is out of range clients, which size is " - << _client_channels.size(); + VLOG(0) << "to_client_id is out of range clients, which size is " + << _client_channels.size(); promise->set_value(-1); return fut; } @@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } auto status = pull_sparse((float **)save_vec.data(), table_id, - save_key.data(), save_key.size()); + save_key.data(), save_key.size(), true); status.wait(); // create lod tensor diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/service/brpc_ps_client.h index 8f9d2653864d1c7fd1801632a6c84edb1bc04ccf..5192356e4b5e574de385478c57a7b7cedb49988a 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient { virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, size_t num); + const uint64_t *keys, size_t num, + bool is_training); virtual std::future print_table_stat(uint32_t table_id); @@ -170,9 +171,22 @@ class BrpcPsClient : public PSClient { virtual int32_t recv_and_save_table(const uint64_t table_id, const std::string &path); - private: + protected: + virtual size_t get_server_nums() { return _server_channels.size(); } + inline brpc::Channel *get_sparse_channel(size_t server_id) { + return _server_channels[server_id][0].get(); + } + inline brpc::Channel *get_dense_channel(size_t server_id) { + return _server_channels[server_id][1].get(); + } + inline brpc::Channel *get_cmd_channel(size_t server_id) { + return _server_channels[server_id][2].get(); + } virtual int32_t initialize() override; + private: + // virtual int32_t initialize() override; + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, uint32_t shard_num) { return dense_dim_total / shard_num + 1; @@ -184,16 +198,6 @@ class BrpcPsClient : public PSClient 
{ std::future send_save_cmd(uint32_t table_id, int cmd_id, const std::vector ¶m); - inline brpc::Channel *get_sparse_channel(size_t server_id) { - return _server_channels[server_id][0].get(); - } - inline brpc::Channel *get_dense_channel(size_t server_id) { - return _server_channels[server_id][1].get(); - } - inline brpc::Channel *get_cmd_channel(size_t server_id) { - return _server_channels[server_id][2].get(); - } - bool _running = false; bool _flushing = false; std::atomic _async_call_num; //异步请求计数 @@ -220,8 +224,6 @@ class BrpcPsClient : public PSClient { size_t num, void *done) override; - virtual size_t get_server_nums() { return _server_channels.size(); } - private: int32_t start_client_service(); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index 8400e669182d670b892dc2eb55492a92ee919ae5..a1440260bf2e77093bb937e62b13b54ad06a3e64 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "butil/object_pool.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" @@ -60,7 +62,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); std::string ip_port = ip + ":" + std::to_string(port); - VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + VLOG(0) << "running server with rank id: " << _rank + << ", endpoint: " << ip_port; brpc::ServerOptions options; int num_threads = std::thread::hardware_concurrency(); @@ -194,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, return 0; } - std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_dense(res_data.data(), num); + auto res_data = butil::get_object>(); + res_data->resize(num * table->value_accesor()->select_size() / sizeof(float)); + table->pull_dense(res_data->data(), num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -336,35 +340,42 @@ int32_t BrpcPsService::pull_sparse(Table *table, brpc::Controller *cntl) { platform::RecordEvent record_event("PsService->pull_sparse"); CHECK_TABLE_EXIST(table, request, response) - thread_local std::string push_sparse_request_buffer; + auto &req_io_buffer = cntl->request_attachment(); auto req_buffer_size = req_io_buffer.size(); + if (req_buffer_size < 1) { set_response_code(response, -1, "req attachment is empty"); return 0; } + if (request.params_size() < 1) { set_response_code(response, -1, "PsRequestMessage.params is requeired at " "least 1 for num of sparse_key"); return 0; } + uint32_t num = *(uint32_t *)(request.params(0).c_str()); - push_sparse_request_buffer.resize(0); - push_sparse_request_buffer.reserve(req_buffer_size); - const char *data = (const char *)cntl->request_attachment().fetch( - const_cast(push_sparse_request_buffer.data()), req_buffer_size); - /* - Attachment Content: - |---keysData---| - |---8*{num}B---| - */ - const uint64_t *keys = (const uint64_t *)data; - std::vector res_data; - res_data.resize(num * 
table->value_accesor()->select_size() / sizeof(float)); - table->pull_sparse(res_data.data(), keys, num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + auto dim = table->value_accesor()->select_dim(); + + thread_local std::string req_buffer; + req_buffer.reserve(req_buffer_size); + + const void *data = cntl->request_attachment().fetch( + const_cast(req_buffer.data()), req_buffer_size); + + auto value = PullSparseValue(num, dim); + + value.DeserializeFromBytes(const_cast(data)); + + auto res_data = butil::get_object>(); + res_data->resize(num * dim); + table->pull_sparse(res_data->data(), value); + + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -538,7 +549,7 @@ int32_t BrpcPsService::stop_server(Table *table, auto *p_server = _server; std::thread t_stop([p_server]() { p_server->stop(); - LOG(INFO) << "Server Stoped"; + VLOG(3) << "Server Stoped"; }); t_stop.detach(); return 0; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 096718768149c574fd57b91396879d7bec5d37e0..a356b77e73733ed9b657a7603adf57c5228bf3c5 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { while (hp->h_addr_list[i] != NULL) { int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; + VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; break; } diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 8699719e5cdcc8f40cf26fc90c17ad52849804d3..3d5ab8e16d90202d2365c14f764f5e0f53929b68 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, push_g_vec.push_back(tensor->data() + i * dim); } + bool training = true; + auto status = _worker_ptr->pull_sparse( (float **)push_g_vec.data(), table_id, // NOLINT - sparse_push_keys.data(), sparse_push_keys.size()); + sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); return; } diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 043fe9d83dfc53aaa5d13ef1f12745836129aaa0..fa60cab2b58779ede16cc51971277130bcaca909 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -310,6 +310,8 @@ class Communicator { return _worker_ptr; } + RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; } + std::shared_ptr _worker_ptr; // pointer to worker protected: diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h index 901aba0ad90c49c7403862997830bed7e0950dc0..ca395a776afd4e2ee53e0aeaebb94494d4f4e6a6 100644 --- a/paddle/fluid/distributed/service/env.h +++ b/paddle/fluid/distributed/service/env.h @@ -39,7 +39,7 @@ struct PSHost { // |---ip---|---port---|--rank--| // |-32bit--|--20bit---|--12bit-| - // for pslib + uint64_t serialize_to_uint64() { uint64_t host_label = 0; host_label = inet_addr(ip.c_str()); @@ -175,14 +175,12 @@ class PSEnvironment { host.ip = ip; host.port = port; host.rank = rank; - if (sign_set.count(rank) > 0) { 
- LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port - << ", rank:" << host.rank - << " already register, ignore register"; - } else { + + if (sign_set.count(rank) == 0) { host_list.push_back(host); sign_set.insert(rank); } + return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..eafb4d596cc1671db26189b84ea9d0c0c31ea398 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include +#include +#include +#include +#include +#include +#include "Eigen/Dense" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +void GraphPsService_Stub::service( + ::google::protobuf::RpcController *controller, + const ::paddle::distributed::PsRequestMessage *request, + ::paddle::distributed::PsResponseMessage *response, + ::google::protobuf::Closure *done) { + if (graph_service != NULL && local_channel == channel()) { + // VLOG(0)<<"use local"; + task_pool->enqueue([this, controller, request, response, done]() -> int { + this->graph_service->service(controller, request, response, done); + return 0; + }); + } else { + // VLOG(0)<<"use server"; + PsService_Stub::service(controller, request, response, done); + } +} + +int GraphBrpcClient::get_server_index_by_id(uint64_t id) { + int shard_num = get_shard_num(); + int shard_per_server = shard_num % server_size == 0 + ? 
shard_num / server_size + : shard_num / server_size + 1; + return id % shard_num / shard_per_server; +} + +std::future GraphBrpcClient::get_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); + ++feat_idx) { + for (size_t node_idx = 0; + node_idx < query_idx_buckets.at(request_idx).size(); + ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feature = std::string(buffer, feat_len); + res[feat_idx][query_idx] = feature; + buffer += feat_len; + } + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +// char* &buffer,int &actual_size +std::future 
GraphBrpcClient::batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + res.clear(); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + res.push_back(std::vector>()); + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = + buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[query_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + 
return fut; +} +std::future GraphBrpcClient::random_sample_nodes( + uint32_t table_id, int server_index, int sample_size, + std::vector &ids) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + ids.push_back(*(uint64_t *)(buffer + index)); + index += GraphNode::id_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +std::future GraphBrpcClient::pull_graph_list( + uint32_t table_id, int server_index, int start, int size, int step, + std::vector &res) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + FeatureNode node; + node.recover_from_buffer(buffer + index); + index += node.get_size(false); + res.push_back(node); + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&start, sizeof(int)); + closure->request(0)->add_params((char *)&size, sizeof(int)); + closure->request(0)->add_params((char *)&step, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +int32_t GraphBrpcClient::initialize() { + // set_shard_num(_config.shard_num()); + BrpcPsClient::initialize(); + server_size = get_server_nums(); + graph_service = NULL; + local_channel = NULL; + return 0; +} +} +} diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..4e6775a4bedaf1a4028fe483f58be818ef1e3581 --- /dev/null +++ 
b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include "ThreadPool.h" +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace distributed { + +class GraphPsService_Stub : public PsService_Stub { + public: + GraphPsService_Stub(::google::protobuf::RpcChannel* channel, + ::google::protobuf::RpcChannel* local_channel = NULL, + GraphBrpcService* service = NULL, int thread_num = 1) + : PsService_Stub(channel) { + this->local_channel = local_channel; + this->graph_service = service; + task_pool.reset(new ::ThreadPool(thread_num)); + } + virtual ~GraphPsService_Stub() {} + + // implements PsService ------------------------------------------ + GraphBrpcService* graph_service; + std::shared_ptr<::ThreadPool> task_pool; + ::google::protobuf::RpcChannel* local_channel; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub); + void service(::google::protobuf::RpcController* controller, + const ::paddle::distributed::PsRequestMessage* request, + ::paddle::distributed::PsResponseMessage* response, + ::google::protobuf::Closure* done); +}; +class GraphBrpcClient : public BrpcPsClient { + public: + GraphBrpcClient() {} + virtual ~GraphBrpcClient() {} + // given a batch of nodes, sample graph_neighboors for each of them + virtual std::future batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>>& res); + + virtual std::future pull_graph_list(uint32_t table_id, + int server_index, int start, + int size, int step, + std::vector& res); + virtual std::future random_sample_nodes(uint32_t table_id, + int server_index, + int sample_size, + std::vector& ids); + virtual std::future get_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + std::vector>& res); + virtual int32_t initialize(); + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + int get_server_index_by_id(uint64_t id); + void set_local_channel(int index) { + this->local_channel = get_cmd_channel(index); + } + void set_local_graph_service(GraphBrpcService* graph_service) { + this->graph_service = graph_service; + } + GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel, + int thread_num = 1) { + return GraphPsService_Stub(channel, local_channel, graph_service, + thread_num); + } + + private: + int shard_num; + size_t server_size; 
+ ::google::protobuf::RpcChannel* local_channel; + GraphBrpcService* graph_service; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..bdd926278b624b9e9bfdf19a4f293784bef6e28f --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" + +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { + +int32_t GraphBrpcServer::initialize() { + auto &service_config = _config.downpour_server_param().service_param(); + if (!service_config.has_service_class()) { + LOG(ERROR) << "miss service_class in ServerServiceParameter"; + return -1; + } + auto *service = + CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class()); + if (service == NULL) { + LOG(ERROR) << "service is unregistered, service_name:" + << service_config.service_class(); + return -1; + } + + _service.reset(service); + if (service->configure(this) != 0 || service->initialize() != 0) { + LOG(ERROR) << "service initialize failed, service_name:" + << service_config.service_class(); + return -1; + } + if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "service add to brpc failed, service:" + << service_config.service_class(); + return -1; + } + return 0; +} + +uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { + std::unique_lock lock(mutex_); + + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + brpc::ServerOptions options; + + int num_threads = std::thread::hardware_concurrency(); + auto trainers = _environment->get_trainers(); + options.num_threads = trainers > num_threads ? 
trainers : num_threads;
+
+  if (_server.Start(ip_port.c_str(), &options) != 0) {
+    LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port;
+    return 0;
+  }
+  _environment->registe_ps_server(ip, port, _rank);
+  return 0;
+}
+
+int32_t GraphBrpcServer::port() { return _server.listen_address().port; }
+
+int32_t GraphBrpcService::initialize() {
+  _is_initialize_shard_info = false;
+  _service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server;
+  _service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table;
+  _service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table;
+
+  _service_handler_map[PS_PRINT_TABLE_STAT] =
+      &GraphBrpcService::print_table_stat;
+  _service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier;
+  _service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler;
+  _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler;
+
+  _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list;
+  _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] =
+      &GraphBrpcService::graph_random_sample_neighboors;
+  _service_handler_map[PS_GRAPH_SAMPLE_NODES] =
+      &GraphBrpcService::graph_random_sample_nodes;
+  _service_handler_map[PS_GRAPH_GET_NODE_FEAT] =
+      &GraphBrpcService::graph_get_node_feat;
+
+  // shard initialization: the shard info of server_list can only be obtained
+  // from env after the server has started
+  initialize_shard_info();
+
+  return 0;
+}
+
+#define CHECK_TABLE_EXIST(table, request, response)        \
+  if (table == NULL) {                                     \
+    std::string err_msg("table not found with table_id:"); \
+    err_msg.append(std::to_string(request.table_id()));    \
+    set_response_code(response, -1, err_msg.c_str());      \
+    return -1;                                             \
+  }
+
+int32_t GraphBrpcService::initialize_shard_info() {
+  if (!_is_initialize_shard_info) {
+    std::lock_guard<std::mutex> guard(_initialize_shard_mutex);
+    if (_is_initialize_shard_info) {
+      return 0;
+    }
+    size_t shard_num = _server->environment()->get_ps_servers().size();
+    auto &table_map = *(_server->table());
+    for (auto itr : table_map) {
+      itr.second->set_shard(_rank, shard_num);
+    }
+    _is_initialize_shard_info = true;
+  }
+  return 0;
+}
+
+void GraphBrpcService::service(google::protobuf::RpcController *cntl_base,
+                               const PsRequestMessage *request,
+                               PsResponseMessage *response,
+                               google::protobuf::Closure *done) {
+  brpc::ClosureGuard done_guard(done);
+  std::string log_label("ReceiveCmd-");
+  if (!request->has_table_id()) {
+    set_response_code(*response, -1, "PsRequestMessage.table_id is required");
+    return;
+  }
+
+  response->set_err_code(0);
+  response->set_err_msg("");
+  auto *table = _server->table(request->table_id());
+  brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
+  auto itr = _service_handler_map.find(request->cmd_id());
+  if (itr == _service_handler_map.end()) {
+    std::string err_msg(
+        "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:");
+    err_msg.append(std::to_string(request->cmd_id()));
+    set_response_code(*response, -1, err_msg.c_str());
+    return;
+  }
+  serviceFunc handler_func = itr->second;
+  int service_ret = (this->*handler_func)(table, *request, *response, cntl);
+  if (service_ret != 0) {
+    response->set_err_code(service_ret);
+    response->set_err_msg("server internal error");
+  }
+}
+
+int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request,
+                                  PsResponseMessage &response,
+                                  brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+
+  if (request.params_size() < 1) {
+    set_response_code(response, -1,
+                      "PsRequestMessage.params is required at "
+                      "least 1 for num of sparse_key");
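+    // the error is already recorded in the response; returning 0 below keeps
+    // this message instead of the generic "server internal error" that
+    // service() writes for nonzero handler returns.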
return 0; + } + + auto trainer_id = request.client_id(); + auto barrier_type = request.params(0); + table->barrier(trainer_id, barrier_type); + return 0; +} + +int32_t GraphBrpcService::print_table_stat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + std::pair ret = table->print_table_stat(); + paddle::framework::BinaryArchive ar; + ar << ret.first << ret.second; + std::string table_info(ar.Buffer(), ar.Length()); + response.set_data(table_info); + + return 0; +} + +int32_t GraphBrpcService::load_one_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "PsRequestMessage.datas is requeired at least 2 for path & load_param"); + return -1; + } + if (table->load(request.params(0), request.params(1)) != 0) { + set_response_code(response, -1, "table load failed"); + return -1; + } + return 0; +} + +int32_t GraphBrpcService::load_all_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + auto &table_map = *(_server->table()); + for (auto &itr : table_map) { + if (load_one_table(itr.second.get(), request, response, cntl) != 0) { + LOG(ERROR) << "load table[" << itr.first << "] failed"; + return -1; + } + } + return 0; +} + +int32_t GraphBrpcService::stop_server(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + GraphBrpcServer *p_server = (GraphBrpcServer *)_server; + std::thread t_stop([p_server]() { + p_server->stop(); + LOG(INFO) << "Server Stoped"; + }); + p_server->export_cv()->notify_all(); + t_stop.detach(); + return 0; +} + +int32_t GraphBrpcService::stop_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::DisableProfiler(platform::EventSortingKey::kDefault, + string::Sprintf("server_%s_profile", _rank)); + return 0; +} + +int32_t GraphBrpcService::start_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + return 0; +} + +int32_t GraphBrpcService::pull_graph_list(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 3) { + set_response_code(response, -1, + "pull_graph_list request requires at least 3 arguments"); + return 0; + } + int start = *(int *)(request.params(0).c_str()); + int size = *(int *)(request.params(1).c_str()); + int step = *(int *)(request.params(2).c_str()); + std::unique_ptr buffer; + int actual_size; + ((GraphTable *)table) + ->pull_graph_list(start, size, buffer, actual_size, false, step); + cntl->response_attachment().append(buffer.get(), actual_size); + return 0; +} +int32_t GraphBrpcService::graph_random_sample_neighboors( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t 
*)(request.params(0).c_str()); + int sample_size = *(uint64_t *)(request.params(1).c_str()); + std::vector> buffers(node_num); + std::vector actual_sizes(node_num, 0); + ((GraphTable *)table) + ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes); + + cntl->response_attachment().append(&node_num, sizeof(size_t)); + cntl->response_attachment().append(actual_sizes.data(), + sizeof(int) * node_num); + for (size_t idx = 0; idx < node_num; ++idx) { + cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]); + } + return 0; +} +int32_t GraphBrpcService::graph_random_sample_nodes( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + size_t size = *(uint64_t *)(request.params(0).c_str()); + std::unique_ptr buffer; + int actual_size; + if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == + 0) { + cntl->response_attachment().append(buffer.get(), actual_size); + } else + cntl->response_attachment().append(NULL, 0); + + return 0; +} + +int32_t GraphBrpcService::graph_get_node_feat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + std::vector feature_names = + paddle::string::split_string(request.params(1), "\t"); + + std::vector> feature( + feature_names.size(), std::vector(node_num)); + + ((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = feature[feat_idx][node_idx].size(); + cntl->response_attachment().append(&feat_len, sizeof(size_t)); + cntl->response_attachment().append(feature[feat_idx][node_idx].data(), + feat_len); + } + } + + return 0; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..32c572f9e6c2bf759c59190679bcf7570a807f2d --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
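+
+// GraphBrpcServer exposes GraphTable shards over brpc. GraphBrpcService maps
+// the graph-specific PsCmdIDs (PS_PULL_GRAPH_LIST, PS_GRAPH_SAMPLE_NEIGHBOORS,
+// PS_GRAPH_SAMPLE_NODES, PS_GRAPH_GET_NODE_FEAT) onto GraphTable calls,
+// alongside the generic table / profiler / barrier handlers.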
+ +#pragma once + +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" + +#include +#include +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/table/table.h" +namespace paddle { +namespace distributed { +class GraphBrpcServer : public PSServer { + public: + GraphBrpcServer() {} + virtual ~GraphBrpcServer() {} + PsBaseService *get_service() { return _service.get(); } + virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t stop() { + std::unique_lock lock(mutex_); + if (stoped_) return 0; + stoped_ = true; + // cv_.notify_all(); + _server.Stop(1000); + _server.Join(); + return 0; + } + virtual int32_t port(); + + std::condition_variable *export_cv() { return &cv_; } + + private: + virtual int32_t initialize(); + mutable std::mutex mutex_; + std::condition_variable cv_; + bool stoped_ = false; + brpc::Server _server; + std::shared_ptr _service; + std::vector> _pserver_channels; +}; + +class GraphBrpcService; + +typedef int32_t (GraphBrpcService::*serviceFunc)( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl); + +class GraphBrpcService : public PsBaseService { + public: + virtual int32_t initialize() override; + + virtual void service(::google::protobuf::RpcController *controller, + const PsRequestMessage *request, + PsResponseMessage *response, + ::google::protobuf::Closure *done) override; + + protected: + std::unordered_map _service_handler_map; + int32_t initialize_shard_info(); + int32_t pull_graph_list(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_random_sample_neighboors(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_random_sample_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_one_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_all_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_server(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t start_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + int32_t print_table_stat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + private: + bool _is_initialize_shard_info; + std::mutex _initialize_shard_mutex; + std::unordered_map _msg_handler_map; + std::vector _ori_values; + const int sample_nodes_ranges = 23; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc new file mode 100644 index 
0000000000000000000000000000000000000000..61e4e0cf7bb9155d25c630296c2b55a7d3400bfc --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { +std::vector GraphPyService::split(std::string& str, + const char pattern) { + std::vector res; + std::stringstream input(str); + std::string temp; + while (std::getline(input, temp, pattern)) { + res.push_back(temp); + } + return res; +} + +void GraphPyService::add_table_feat_conf(std::string table_name, + std::string feat_name, + std::string feat_dtype, + int32_t feat_shape) { + if (this->table_id_map.count(table_name)) { + this->table_feat_conf_table_name.push_back(table_name); + this->table_feat_conf_feat_name.push_back(feat_name); + this->table_feat_conf_feat_dtype.push_back(feat_dtype); + this->table_feat_conf_feat_shape.push_back(feat_shape); + } +} + +void GraphPyService::set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types) { + set_shard_num(shard_num); + set_num_node_types(node_types.size()); + + for (size_t table_id = 0; table_id < node_types.size(); table_id++) { + this->table_id_map[node_types[table_id]] = this->table_id_map.size(); + } + for (size_t table_id = 0; table_id < edge_types.size(); table_id++) { + this->table_id_map[edge_types[table_id]] = this->table_id_map.size(); + } + std::istringstream stream(ips_str); + std::string ip; + server_size = 0; + std::vector ips_list = split(ips_str, ';'); + int index = 0; + for (auto ips : ips_list) { + auto ip_and_port = split(ips, ':'); + server_list.push_back(ip_and_port[0]); + port_list.push_back(ip_and_port[1]); + uint32_t port = stoul(ip_and_port[1]); + auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index); + host_sign_list.push_back(ph_host.serialize_to_string()); + index++; + } +} +void GraphPyClient::start_client() { + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list, servers_); + worker_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id); + worker_ptr->set_shard_num(get_shard_num()); +} +void GraphPyServer::start_server(bool block) { + std::string ip = server_list[rank]; + uint32_t port = 
std::stoul(port_list[rank]); + ::paddle::distributed::PSParameter server_proto = this->GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&this->host_sign_list, + this->host_sign_list.size()); // test + pserver_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + VLOG(0) << "pserver-ptr created "; + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); + pserver_ptr->start(ip, port); + std::condition_variable* cv_ = pserver_ptr->export_cv(); + if (block) { + std::mutex mutex_; + std::unique_lock lock(mutex_); + cv_->wait(lock); + } +} +::paddle::distributed::PSParameter GraphPyServer::GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + 
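+      // table ids below num_node_types were assigned to node types in
+      // set_up(), the remaining ids to edge types, hence the switch above.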
GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second, + tuple.first, table_type, feat_name, feat_dtype, + feat_shape); + } + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return worker_fleet_desc; +} +void GraphPyClient::load_edge_file(std::string name, std::string filepath, + bool reverse) { + // 'e' means load edge + std::string params = "e"; + if (reverse) { + // 'e<' means load edges from $2 to $1 + params += "<"; + } else { + // 'e>' means load edges from $1 to $2 + params += ">"; + } + if (this->table_id_map.count(name)) { + VLOG(0) << "loadding data with type " << name << " from " << filepath; + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} + +void GraphPyClient::load_node_file(std::string name, std::string filepath) { + // 'n' means load nodes and 'node_type' follows + std::string params = "n" + name; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} +std::vector>> +GraphPyClient::batch_sample_neighboors(std::string name, + std::vector node_ids, + int sample_size) { + std::vector>> v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v); + status.wait(); + } + return v; +} + +// (name, dtype, ndarray) +std::vector> GraphPyClient::get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names) { + std::vector> v( + feature_names.size(), std::vector(node_ids.size())); + if (this->table_id_map.count(node_type)) { + uint32_t 
table_id = this->table_id_map[node_type]; + auto status = + worker_ptr->get_node_feat(table_id, node_ids, feature_names, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::pull_graph_list(std::string name, + int server_index, + int start, int size, + int step) { + std::vector res; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = worker_ptr->pull_graph_list(table_id, server_index, start, + size, step, res); + status.wait(); + } + return res; +} + +void GraphPyClient::stop_server() { + VLOG(0) << "going to stop server"; + std::unique_lock lock(mutex_); + if (stoped_) return; + auto status = this->worker_ptr->stop_server(); + if (status.get() == 0) stoped_ = true; +} +void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); } +} +} diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h new file mode 100644 index 0000000000000000000000000000000000000000..c6657be96ba446d2f7538943aab43dd47e1868fb --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -0,0 +1,166 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
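+
+// Python-facing wrappers around the graph parameter server: GraphPyService
+// holds the shared table / feature configuration, GraphPyServer drives a
+// GraphBrpcServer and GraphPyClient drives a GraphBrpcClient.
+// Illustrative flow only (host/port, shard count and type names are made up):
+//   GraphPyServer server;
+//   server.set_up("127.0.0.1:8245", 127, {"user"}, {"u2u"}, /*rank=*/0);
+//   server.start_server(false);
+//   GraphPyClient client;
+//   client.set_up("127.0.0.1:8245", 127, {"user"}, {"u2u"}, /*client_id=*/0);
+//   client.start_client();
+//   client.load_edge_file("u2u", "u2u_edges.txt", false);
+//   auto samples = client.batch_sample_neighboors("u2u", {37}, 20);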
+ +#pragma once +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +namespace paddle { +namespace distributed { +class GraphPyService { + protected: + std::vector server_list, port_list, host_sign_list; + int server_size, shard_num; + int num_node_types; + std::unordered_map table_id_map; + std::vector table_feat_conf_table_name; + std::vector table_feat_conf_feat_name; + std::vector table_feat_conf_feat_dtype; + std::vector table_feat_conf_feat_shape; + + public: + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto, + uint32_t table_id, std::string table_name, std::string table_type, + std::vector feat_name, std::vector feat_dtype, + std::vector feat_shape) { + sparse_table_proto->set_table_id(table_id); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(shard_num); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + + ::paddle::distributed::CommonAccessorParameter* common_proto = + sparse_table_proto->mutable_common(); + + // Set GraphTable Parameter + common_proto->set_table_name(table_name); + common_proto->set_name(table_type); + for (size_t i = 0; i < feat_name.size(); i++) { + common_proto->add_params(feat_dtype[i]); + common_proto->add_dims(feat_shape[i]); + common_proto->add_attributes(feat_name[i]); + } + + accessor_proto->set_accessor_class("CommMergeAccessor"); + } + + void set_server_size(int server_size) { this->server_size = server_size; } + void set_num_node_types(int num_node_types) { + this->num_node_types = num_node_types; + } + int get_server_size(int server_size) { return server_size; } + std::vector split(std::string& str, const char pattern); + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types); + + void add_table_feat_conf(std::string node_type, std::string feat_name, + std::string feat_dtype, int32_t feat_shape); +}; +class GraphPyServer : public GraphPyService { + public: + GraphPyServer() {} + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int rank) { + set_rank(rank); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + int get_rank() { return rank; } + void set_rank(int rank) { this->rank = rank; } + + void start_server(bool block = true); + ::paddle::distributed::PSParameter GetServerProto(); + std::shared_ptr get_ps_server() { + return pserver_ptr; + } + + protected: + int rank; 
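+  // rank of this server inside host_sign_list; pserver_ptr owns the
+  // GraphBrpcServer created in start_server().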
+ std::shared_ptr pserver_ptr; + std::thread* server_thread; +}; +class GraphPyClient : public GraphPyService { + public: + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int client_id) { + set_client_id(client_id); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + std::shared_ptr get_ps_client() { + return worker_ptr; + } + void bind_local_server(int local_channel_index, GraphPyServer& server) { + worker_ptr->set_local_channel(local_channel_index); + worker_ptr->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)server.get_ps_server() + ->get_service()); + } + void stop_server(); + void finalize_worker(); + void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_node_file(std::string name, std::string filepath); + int get_client_id() { return client_id; } + void set_client_id(int client_id) { this->client_id = client_id; } + void start_client(); + std::vector>> batch_sample_neighboors( + std::string name, std::vector node_ids, int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); + std::vector> get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names); + std::vector pull_graph_list(std::string name, int server_index, + int start, int size, int step = 1); + ::paddle::distributed::PSParameter GetWorkerProto(); + + protected: + mutable std::mutex mutex_; + int client_id; + std::shared_ptr worker_ptr; + std::thread* client_thread; + bool stoped_ = false; +}; +} +} diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc index 095b5dee0b28e4f3319927aa2440e906489db7de..d45f41a0f58de36bb1575c1b51663f8899fb215d 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -15,11 +15,15 @@ #include "paddle/fluid/distributed/service/ps_client.h" #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/ps_local_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); +REGISTER_PSCORE_CLASS(PSClient, PsLocalClient); +REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); int32_t PSClient::configure( const PSParameter &config, @@ -78,8 +82,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { } TableManager::instance().initialize(); - LOG(INFO) << "Create PSClient[" << service_param.client_class() - << "] success"; + VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success"; return client; } } // namespace distributed diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 50f5802c63a2538566988f75e3c098bd01785294..74a1e0dde71fc4a3dd7af17968502131556d0518 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -24,16 +24,11 @@ #include "paddle/fluid/distributed/service/env.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" namespace paddle { namespace distributed { -class PSEnvironment; -class PsRequestMessage; -class PsResponseMessage; -class ValueAccessor; -struct Region; - using 
paddle::distributed::PsRequestMessage; using paddle::distributed::PsResponseMessage; @@ -117,10 +112,22 @@ class PSClient { // future结束前keys和values缓冲区不能再次使用 // 整合多个线程请求的keys,聚集并分散发送到server // 返回结果后,遍历buffer并对values赋值 + // is_training 用于区分请求是训练/预测,server端对于特征和准入会有不同的处理. virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) = 0; + const uint64_t *keys, size_t num, + bool is_training) = 0; + + virtual ::std::future pull_sparse_ptr(char **select_values, + size_t table_id, + const uint64_t *keys, + size_t num) { + VLOG(0) << "Did not implement"; + std::promise promise; + std::future fut = promise.get_future(); + promise.set_value(-1); + return fut; + } virtual std::future print_table_stat(uint32_t table_id) = 0; @@ -154,12 +161,13 @@ class PSClient { virtual std::future send_client2client_msg(int msg_type, int to_client_id, const std::string &msg) { - LOG(FATAL) << "Did not implement"; + VLOG(0) << "Did not implement"; std::promise promise; std::future fut = promise.get_future(); promise.set_value(-1); return fut; } + // client2client消息处理,std::function ret (msg_type, from_client_id, msg) typedef std::function MsgHandlerFunc; diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..2acc845a50890beb834676c3394f8dabc2a77e78 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
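+
+// PsLocalClient serves the PSClient interface from tables held in this
+// process (no brpc), so single-node training can reuse the same client API;
+// calls are either direct table operations or no-ops that return an already
+// completed future (see done()).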
+ +#include "paddle/fluid/distributed/service/ps_local_client.h" +#include "paddle/fluid/distributed/table/table.h" + +//#define pslib_debug_dense_compress + +namespace paddle { +namespace distributed { +int32_t PsLocalClient::initialize() { + const auto& downpour_param = _config.server_param().downpour_server_param(); + TableManager::instance().initialize(); + for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) { + auto* table = CREATE_PSCORE_CLASS( + Table, downpour_param.downpour_table_param(i).table_class()); + table->initialize(downpour_param.downpour_table_param(i), + _config.fs_client_param()); + table->set_shard(0, 1); + _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table); + } + return 0; +} + +::std::future PsLocalClient::shrink(uint32_t table_id, + const std::string threshold) { + // TODO + return done(); +} + +::std::future PsLocalClient::load(const std::string& epoch, + const std::string& mode) { + // TODO + // for (auto& it : _table_map) { + // load(it.first, epoch, mode); + //} + return done(); +} +::std::future PsLocalClient::load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + // auto* table_ptr = table(table_id); + // table_ptr->load(epoch, mode); + return done(); +} + +::std::future PsLocalClient::save(const std::string& epoch, + const std::string& mode) { + // TODO + for (auto& it : _table_map) { + save(it.first, epoch, mode); + } + return done(); +} +::std::future PsLocalClient::save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + auto* table_ptr = table(table_id); + table_ptr->flush(); + table_ptr->save(epoch, mode); + return done(); +} + +::std::future PsLocalClient::clear() { + // TODO + return done(); +} +::std::future PsLocalClient::clear(uint32_t table_id) { + // TODO + return done(); +} + +::std::future PsLocalClient::flush() { + // no need + return done(); +} + +::std::future PsLocalClient::stop_server() { + // no need + return done(); +} + +::std::future PsLocalClient::pull_dense(Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + std::vector region_buffer; + region_buffer.resize(num_per_shard); + table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); + + size_t region_idx = 0; + size_t region_data_idx = 0; + size_t shard_data_size = num_per_shard; + size_t shard_buffer_remain = shard_data_size * sizeof(float); + PADDLE_ENFORCE_EQ( + shard_buffer_remain, region_buffer.size() * sizeof(float), + platform::errors::PreconditionNotMet("pull dense size error.")); + size_t index = 0; + while (shard_buffer_remain > 0 && region_idx < region_num) { + auto& region = regions[region_idx]; + if (region.size - region_data_idx >= shard_buffer_remain) { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + shard_buffer_remain); + region_data_idx += shard_buffer_remain; + shard_buffer_remain = 0; + } else if (region.size - region_data_idx == 0) { + ++region_idx; + region_data_idx = 0; + } else { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + region.size - region_data_idx); + shard_buffer_remain -= (region.size - region_data_idx); + index += (region.size - region_data_idx); + ++region_idx; + region_data_idx = 0; + } + } + + return done(); +} + +::std::future 
PsLocalClient::push_dense_param(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + // table_ptr->push_dense_param(region_buffer.data(), region_buffer.size()); + + return done(); +} + +::std::future PsLocalClient::push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) { + VLOG(1) << "wxx push_dense_raw_gradient"; + + PSClientClosure* closure = reinterpret_cast(callback); + + auto* table_ptr = table(table_id); + + table_ptr->push_dense(total_send_data, total_send_data_size); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_dense(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + size_t data_size = region_buffer.size(); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + PADDLE_ENFORCE_LE( + offset + data_num, data_size, + platform::errors::PreconditionNotMet( + "invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset, + data_num, data_size)); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + table_ptr->push_dense(region_buffer.data(), region_buffer.size()); + + return done(); +} + +//::std::future PsLocalClient::pull_sparse(float** select_values, +// size_t table_id, +// const uint64_t* keys, +// size_t num) { +// // FIXME +// // auto timer = +// // std::make_shared("pslib_downpour_client_pull_sparse"); +// // auto local_timer = +// // std::make_shared("pslib_downpour_client_pull_sparse_local"); +// //将key拆分到各shard请求,并记录原始对应value指针 +// auto* accessor = table_accessor(table_id); +// auto* table_ptr = table(table_id); +// size_t value_size = accessor->select_size(); +// +// // table_ptr->pull_sparse(keys, num); +// std::vector res_data; +// res_data.resize(num * value_size / sizeof(float)); +// table_ptr->pull_sparse(res_data.data(), keys, num); +// // memcpy(select_values[0], res_data->data(), res_data->size() * +// // sizeof(float)); +// size_t offset = 0; +// for (int i = 0; i < num; ++i) { +// memcpy(select_values[i], (char*)res_data.data() + offset, value_size); +// offset += value_size; +// } +// +// // return fut; +// return done(); +//} + +::std::future PsLocalClient::pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num) { + // FIXME + // auto timer = + // std::make_shared("pslib_downpour_client_pull_sparse"); + // auto local_timer = + // std::make_shared("pslib_downpour_client_pull_sparse_local"); + //将key拆分到各shard请求,并记录原始对应value指针 + auto* table_ptr = table(table_id); + + table_ptr->pull_sparse_ptr(select_values, keys, num); + + return done(); +} + +::std::future PsLocalClient::push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) { + VLOG(1) << "wxx push_sparse_raw_gradient"; + PSClientClosure* closure = reinterpret_cast(callback); + auto* accessor = 
table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + return done(); +} +} +} diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/service/ps_local_client.h new file mode 100644 index 0000000000000000000000000000000000000000..9d2b01a45fe929097c06fb264f470974410e7f4e --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.h @@ -0,0 +1,226 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License 0// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/ps_client.h" + +namespace paddle { +namespace distributed { + +class Table; + +class PsLocalClient : public PSClient { + public: + PsLocalClient() {} + virtual ~PsLocalClient() { _running = false; } + virtual int32_t create_client2client_connection(int pslib_timeout_ms, + int pslib_connect_timeout_ms, + int max_retry) { + return 0; + } + + virtual ::std::future shrink(uint32_t table_id, + const std::string threshold) override; + virtual ::std::future load(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future save(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future clear() override; + virtual ::std::future clear(uint32_t table_id) override; + + virtual ::std::future stop_server() override; + + virtual void finalize_worker() override {} + virtual ::std::future pull_dense(Region* regions, size_t region_num, + size_t table_id); + + virtual ::std::future push_dense(const Region* regions, + size_t region_num, size_t table_id); + + virtual ::std::future push_dense_param(const Region* regions, + size_t region_num, + size_t table_id); + + virtual ::std::future pull_sparse(float** select_values, + size_t table_id, + const uint64_t* keys, size_t num, + bool is_training) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual ::std::future pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num); + + virtual ::std::future print_table_stat(uint32_t table_id) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual ::std::future push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num); + + virtual ::std::future 
flush(); + // server profilera + virtual std::future start_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + }; + + virtual std::future stop_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future barrier(size_t table_id, uint32_t barrier_type) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future pull_geo_param(size_t table_id, + std::vector* values, + std::vector* keys, + int pserver_idx) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_global_step(int table_id, + int64_t* total_send_data, + void* done) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + // recv table from server and save it in LodTensor + virtual int32_t recv_and_save_table(const uint64_t table_id, + const std::string& path) { + return 0; + } + + virtual ::std::future send_client2client_msg( + int msg_type, int to_client_id, const std::string& msg) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual size_t get_server_nums() { return 1; } + + virtual std::future push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) override; + + virtual std::future push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) override; + + virtual std::future push_sparse_raw_gradient_partial( + size_t table_id, const uint64_t* keys, const float** update_values, + uint32_t num, void* done, int pserver_idx) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_sparse_param(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num, + void* done) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + private: + virtual int32_t initialize() override; + + std::future done() { + std::shared_ptr> prom = + std::make_shared>(); + std::future fut = prom->get_future(); + prom->set_value(0); + return fut; + } + + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, + uint32_t shard_num) { + return dense_dim_total / shard_num + 1; + } + + inline std::unordered_map>* table() { + return &_table_map; + } + + inline Table* table(size_t table_id) { + auto itr = _table_map.find(table_id); + if (itr != _table_map.end()) { + return itr->second.get(); + } + LOG(ERROR) << "table not found " << table_id; + return NULL; + } + + std::unordered_map> _table_map; + + bool _running = false; + bool _flushing = false; + + private: + float _mae = 0; + float _mse = 0; + uint16_t _push_times = 0; +}; +} +} diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/distributed/service/ps_local_server.h similarity index 56% rename from paddle/fluid/operators/distributed/parameter_send.h rename to paddle/fluid/distributed/service/ps_local_server.h index 4335ef8c73cc0a3f4d019cbfe9be078a88914217..dfbccc70900e3cf10fbb0852a114e400d738e2d6 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,22 +14,24 @@ #pragma once -#include +#include #include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" +#include "paddle/fluid/distributed/service/server.h" namespace paddle { -namespace operators { namespace distributed { -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; +class PsLocalServer : public PSServer { + public: + PsLocalServer() {} + virtual ~PsLocalServer() {} + virtual uint64_t start() { return 0; } + virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual int32_t stop() { return 0; } + virtual int32_t port() { return 0; } -}; // namespace distributed -}; // namespace operators -}; // namespace paddle + private: + virtual int32_t initialize() { return 0; } +}; +} +} diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 6250f84c98754d31b6f0a4cf6689e4a560549f2c..d908c26da9870a93d81c0242ac03e26cfebdb976 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -48,6 +48,10 @@ enum PsCmdID { PS_START_PROFILER = 27; PS_STOP_PROFILER = 28; PS_PUSH_GLOBAL_STEP = 29; + PS_PULL_GRAPH_LIST = 30; + PS_GRAPH_SAMPLE_NEIGHBOORS = 31; + PS_GRAPH_SAMPLE_NODES = 32; + PS_GRAPH_GET_NODE_FEAT = 33; } message PsRequestMessage { @@ -111,4 +115,4 @@ message MultiVariableMessage { service PsService { rpc service(PsRequestMessage) returns (PsResponseMessage); rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage); -}; \ No newline at end of file +}; diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index fc230a0b9c92e646f3dc87231effb7462f2340b6..e44876e3d2b789580152626ea8c290db0d369509 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -16,13 +16,18 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_local_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); +REGISTER_PSCORE_CLASS(PSServer, PsLocalServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); +REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); +REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); PSServer *PSServerFactory::create(const PSParameter &ps_config) { const auto &config = ps_config.server_param(); diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 3d0f94fac277509be7b17b648767dc835629b2b3..2759e4614e66e1d69c6427e0320ae44292757ffd 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt( } void PSCore::init_gflag(const std::string& gflags) { - LOG(INFO) << "Init With Gflags:" << gflags; + VLOG(3) << "Init With Gflags:" << gflags; std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); diff 
--git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 1e98e193d54ae6dc6e8a2d9981071283354e7f98..dab390958034af284baaffcb909d8b941fc3b9d1 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -1,13 +1,23 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) - +set(graphDir graph) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) - +set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc) +set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge) +set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator) +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc +sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} +${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..0dc99de1bfe82a691fdacb834acd1ad606dcb04b --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -0,0 +1,506 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
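+
+// GraphTable keeps nodes in GraphShard buckets, partitioned by id % shard_num;
+// each server only owns the shards in [shard_start, shard_end). load_nodes /
+// load_edges parse tab-separated files and build a per-node neighbor sampler
+// ("random", or "weighted" when an edge-weight column is present).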
+ +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include +#include +#include +#include +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +std::vector GraphShard::get_batch(int start, int end, int step) { + if (start < 0) start = 0; + std::vector res; + for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) { + res.push_back(bucket[pos]); + } + return res; +} + +size_t GraphShard::get_size() { return bucket.size(); } + +GraphNode *GraphShard::add_graph_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new GraphNode(id)); + } + return (GraphNode *)bucket[node_location[id]]; +} + +FeatureNode *GraphShard::add_feature_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new FeatureNode(id)); + } + return (FeatureNode *)bucket[node_location[id]]; +} + +void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) { + find_node(id)->add_edge(dst_id, weight); +} + +Node *GraphShard::find_node(uint64_t id) { + auto iter = node_location.find(id); + return iter == node_location.end() ? nullptr : bucket[iter->second]; +} + +int32_t GraphTable::load(const std::string &path, const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + return this->load_edges(path, reverse_edge); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + +int32_t GraphTable::get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res) { + int start = 0, end, index = 0, total_size = 0; + res.clear(); + std::vector>> tasks; + // std::string temp = ""; + // for(int i = 0;i < shards.size();i++) + // temp+= std::to_string((int)shards[i].get_size()) + " "; + // VLOG(0)<<"range distribution "<= end) { + break; + } else { + int first = std::max(ranges[index].first, start); + int second = std::min(ranges[index].second, end); + start = second; + first -= total_size; + second -= total_size; + // VLOG(0)<<" FIND RANGE "<enqueue( + [this, first, second, i]() -> std::vector { + return shards[i].get_ids_by_range(first, second); + })); + } + } + total_size += shards[i].get_size(); + } + for (int i = 0; i < tasks.size(); i++) { + auto vec = tasks[i].get(); + for (auto &id : vec) { + res.push_back(id); + std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); + } + } + return 0; +} + +int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + int64_t valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + count++; + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + auto id = std::stoull(values[1]); + + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path + << ", please check id distribution"; + continue; + } + + if (count % 1000000 == 0) { + VLOG(0) << count << " nodes are loaded from filepath"; + } + + std::string nt = values[0]; + if (nt != 
node_type) { + continue; + } + + size_t index = shard_id - shard_start; + + auto node = shards[index].add_feature_node(id); + + node->set_feature_size(feat_name.size()); + + for (size_t slice = 2; slice < values.size(); slice++) { + auto feat = this->parse_feature(values[slice]); + if (feat.first >= 0) { + node->set_feature(feat.first, feat.second); + } else { + VLOG(4) << "Node feature: " << values[slice] + << " not in feature_map."; + } + } + valid_count++; + } + } + + VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type + << " are loaded successfully in " << path; + return 0; +} + +int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } + + size_t src_shard_id = src_id % shard_num; + + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + if (count % 1000000 == 0) { + VLOG(0) << count << " edges are loaded from filepath"; + } + + size_t index = src_shard_id - shard_start; + shards[index].add_graph_node(src_id)->build_edges(is_weighted); + shards[index].add_neighboor(src_id, dst_id, weight); + valid_count++; + } + } + VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " + << path; + + // Build Sampler j + + for (auto &shard : shards) { + auto bucket = shard.get_bucket(); + for (int i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + return 0; +} + +Node *GraphTable::find_node(uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + size_t index = shard_id - shard_start; + Node *node = shards[index].find_node(id); + return node; +} +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { + return node_id % shard_num % shard_num_per_table % task_pool_size_; +} +int32_t GraphTable::random_sample_nodes(int sample_size, + std::unique_ptr &buffer, + int &actual_size) { + bool need_feature = false; + int total_size = 0; + for (int i = 0; i < shards.size(); i++) { + total_size += shards[i].get_size(); + } + if (sample_size > total_size) sample_size = total_size; + int range_num = random_sample_nodes_ranges; + if (range_num > sample_size) range_num = sample_size; + if (sample_size == 0 || range_num == 0) return 0; + std::vector ranges_len, ranges_pos; + int remain = sample_size, last_pos = -1, num; + std::set separator_set; + for (int i = 0; i < range_num - 1; i++) { + while (separator_set.find(num = rand() % (sample_size - 1)) != + separator_set.end()) + ; + separator_set.insert(num); + } + for (auto p : separator_set) { + ranges_len.push_back(p - last_pos); + last_pos = p; + } + ranges_len.push_back(sample_size - 1 - last_pos); + remain = total_size - sample_size + range_num; + separator_set.clear(); + for (int i = 0; i < range_num; i++) { + while 
(separator_set.find(num = rand() % remain) != separator_set.end()) + ; + separator_set.insert(num); + } + int used = 0, index = 0; + last_pos = -1; + for (auto p : separator_set) { + used += p - last_pos - 1; + last_pos = p; + ranges_pos.push_back(used); + used += ranges_len[index++]; + } + std::vector> first_half, second_half; + int start_index = rand() % total_size; + for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) + first_half.push_back({ranges_pos[i] + start_index, + ranges_pos[i] + ranges_len[i] + start_index}); + else if (ranges_pos[i] + start_index >= total_size) { + second_half.push_back( + {ranges_pos[i] + start_index - total_size, + ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } else { + first_half.push_back({ranges_pos[i] + start_index, total_size}); + second_half.push_back( + {0, ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } + } + for (auto &pair : first_half) second_half.push_back(pair); + std::vector res; + get_nodes_ids_by_ranges(second_half, res); + actual_size = res.size() * sizeof(uint64_t); + buffer.reset(new char[actual_size]); + char *pointer = buffer.get(); + memcpy(pointer, res.data(), actual_size); + return 0; +} +int32_t GraphTable::random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes) { + size_t node_num = buffers.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t &node_id = node_ids[idx]; + std::unique_ptr &buffer = buffers[idx]; + int &actual_size = actual_sizes[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + actual_size = 0; + return 0; + } + std::vector res = node->sample_k(sample_size); + actual_size = res.size() * (Node::id_size + Node::weight_size); + int offset = 0; + uint64_t id; + float weight; + char *buffer_addr = new char[actual_size]; + buffer.reset(buffer_addr); + for (int &x : res) { + id = node->get_neighbor_id(x); + weight = node->get_neighbor_weight(x); + memcpy(buffer_addr + offset, &id, Node::id_size); + offset += Node::id_size; + memcpy(buffer_addr + offset, &weight, Node::weight_size); + offset += Node::weight_size; + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +int32_t GraphTable::get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + size_t node_num = node_ids.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t node_id = node_ids[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&, idx, node_id]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + return 0; + } + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + const std::string &feature_name = feature_names[feat_idx]; + if (feat_id_map.find(feature_name) != feat_id_map.end()) { + // res[feat_idx][idx] = + // node->get_feature(feat_id_map[feature_name]); + auto feat = node->get_feature(feat_id_map[feature_name]); + res[feat_idx][idx] = feat; + } + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +std::pair GraphTable::parse_feature( + std::string feat_str) { + // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, + // "") + auto 
fields = paddle::string::split_string(feat_str, " "); + if (this->feat_id_map.count(fields[0])) { + int32_t id = this->feat_id_map[fields[0]]; + std::string dtype = this->feat_dtype[id]; + int32_t shape = this->feat_shape[id]; + std::vector values(fields.begin() + 1, fields.end()); + if (dtype == "feasign") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "string") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "float32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "float64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } + } + return std::make_pair(-1, ""); +} + +int32_t GraphTable::pull_graph_list(int start, int total_size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step) { + if (start < 0) start = 0; + int size = 0, cur_size; + std::vector>> tasks; + for (size_t i = 0; i < shards.size() && total_size > 0; i++) { + cur_size = shards[i].get_size(); + if (size + cur_size <= start) { + size += cur_size; + continue; + } + int count = std::min(1 + (size + cur_size - start - 1) / step, total_size); + int end = start + (count - 1) * step + 1; + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [this, i, start, end, step, size]() -> std::vector { + + return this->shards[i].get_batch(start - size, end - size, step); + })); + start += count * step; + total_size -= count; + size += cur_size; + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + size = 0; + std::vector> res; + for (size_t i = 0; i < tasks.size(); i++) { + res.push_back(tasks[i].get()); + for (size_t j = 0; j < res.back().size(); j++) { + size += res.back()[j]->get_size(need_feature); + } + } + char *buffer_addr = new char[size]; + buffer.reset(buffer_addr); + int index = 0; + for (size_t i = 0; i < res.size(); i++) { + for (size_t j = 0; j < res[i].size(); j++) { + res[i][j]->to_buffer(buffer_addr + index, need_feature); + index += res[i][j]->get_size(need_feature); + } + } + actual_size = size; + return 0; +} +int32_t GraphTable::initialize() { + _shards_task_pool.resize(task_pool_size_); + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); + } + server_num = _shard_num; + // VLOG(0) << "in init graph table server num = " << server_num; + /* + _shard_num is actually server number here + when a server initialize its tables, it sets tables' _shard_num to server_num, + and _shard_idx to server + rank + */ + auto common = _config.common(); + + this->table_name = common.table_name(); + this->table_type = common.name(); + VLOG(0) << " init graph table type " << this->table_type << " table name " + << this->table_name; + int feat_conf_size = static_cast(common.attributes().size()); + for (int i = 0; i < feat_conf_size; i++) { + auto &f_name = common.attributes()[i]; + auto &f_shape = common.dims()[i]; + auto &f_dtype = common.params()[i]; + this->feat_name.push_back(f_name); + this->feat_shape.push_back(f_shape); + this->feat_dtype.push_back(f_dtype); + this->feat_id_map[f_name] = i; + VLOG(0) << "init graph table feat conf name:" << f_name + << " shape:" << f_shape 
<< " dtype:" << f_dtype; + } + + shard_num = _config.shard_num(); + VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" + << _shard_idx; + shard_num_per_table = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_table; + shard_end = shard_start + shard_num_per_table; + VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " + << shard_start << " shard_end " << shard_end; + // shards.resize(shard_num_per_table); + shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + return 0; +} +} +}; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h new file mode 100644 index 0000000000000000000000000000000000000000..b18da82abe61c9695712f542e187ac48fd5edc9d --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -0,0 +1,131 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { +class GraphShard { + public: + size_t get_size(); + GraphShard() {} + GraphShard(int shard_num) { this->shard_num = shard_num; } + std::vector &get_bucket() { return bucket; } + std::vector get_batch(int start, int end, int step); + std::vector get_ids_by_range(int start, int end) { + std::vector res; + for (int i = start; i < end && i < bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } + GraphNode *add_graph_node(uint64_t id); + FeatureNode *add_feature_node(uint64_t id); + Node *find_node(uint64_t id); + void add_neighboor(uint64_t id, uint64_t dst_id, float weight); + std::unordered_map get_node_location() { + return node_location; + } + + private: + std::unordered_map node_location; + int shard_num; + std::vector bucket; +}; +class GraphTable : public SparseTable { + public: + GraphTable() {} + virtual ~GraphTable() {} + virtual int32_t pull_graph_list(int start, int size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step); + + virtual int32_t random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes); + + int32_t random_sample_nodes(int sample_size, std::unique_ptr &buffers, + int &actual_sizes); + + virtual int32_t get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res); + virtual int32_t initialize(); + + int32_t load(const std::string &path, const std::string ¶m); + + int32_t load_edges(const std::string &path, bool reverse); + + int32_t load_nodes(const std::string &path, std::string node_type); + + Node 
*find_node(uint64_t id); + + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) { + return 0; + } + + virtual int32_t push_sparse(const uint64_t *keys, const float *values, + size_t num) { + return 0; + } + + virtual void clear() {} + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string ¶m) { return 0; } + //指定保存路径 + virtual int32_t save(const std::string &path, const std::string &converter) { + return 0; + } + virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual std::pair parse_feature(std::string feat_str); + + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res); + + protected: + std::vector shards; + size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + const int task_pool_size_ = 24; + const int random_sample_nodes_ranges = 3; + + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + std::unordered_map feat_id_map; + std::string table_name; + std::string table_type; + + std::vector> _shards_task_pool; +}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index ffedbea14a0290730b9d785464a84e3c4536a9e7..718fce9950719fb99e9831bad9490610ec3834cf 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -125,34 +125,37 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { - int64_t not_save_num = 0; - for (auto value : block->values_) { - if (mode == SaveMode::delta && !value.second->need_save_) { - not_save_num++; - continue; - } - - auto* vs = value.second->data_.data(); - std::stringstream ss; - auto id = value.first; - ss << id << "\t" << value.second->count_ << "\t" - << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; - - for (int i = 0; i < block->value_length_; i++) { - ss << vs[i]; - ss << ","; - } + int64_t save_num = 0; + for (auto& table : block->values_) { + for (auto& value : table) { + if (mode == SaveMode::delta && !value.second->need_save_) { + continue; + } + save_num += 1; + + auto* vs = value.second->data_.data(); + std::stringstream ss; + auto id = value.first; + ss << id << "\t" << value.second->count_ << "\t" + << value.second->unseen_days_ << "\t" << value.second->is_entry_ + << "\t"; + + for (int i = 0; i < block->value_length_; i++) { + ss << vs[i]; + ss << ","; + } - ss << "\n"; + ss << "\n"; - os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second->need_save_ = false; + if (mode == SaveMode::base || mode == SaveMode::delta) { + value.second->need_save_ = false; + } } } - return block->values_.size() - not_save_num; + return save_num; } int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, @@ -183,7 +186,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, block->Init(id, false); - auto value_instant = block->GetValue(id); + VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { value_instant->count_ = std::stoi(values[1]); value_instant->unseen_days_ = std::stoi(values[2]); @@ -254,7 +257,6 @@ int32_t 
CommonSparseTable::initialize_value() { } auto accessor = _config.accessor(); - std::vector feasigns; for (size_t x = 0; x < accessor.fea_dim(); ++x) { @@ -271,9 +273,14 @@ int32_t CommonSparseTable::initialize_value() { std::vector ids(bucket_feasigns); std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); std::vector pulls; pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), ids.data(), bucket_feasigns); + pull_sparse(pulls.data(), pull_value); } return 0; @@ -369,8 +376,10 @@ std::pair CommonSparseTable::print_table_stat() { int64_t feasign_size = 0; int64_t mf_size = 0; - for (auto& value : shard_values_) { - feasign_size += value->values_.size(); + for (auto& shard : shard_values_) { + for (auto& table : shard->values_) { + feasign_size += table.size(); + } } return {feasign_size, mf_size}; @@ -399,10 +408,51 @@ int32_t CommonSparseTable::pour() { return 0; } -int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, - size_t num) { +int32_t CommonSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { rwlock_->RDLock(); + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + if (pull_value.is_training_) { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + auto* value = block->Init(feasign, true, frequencie); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } else { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto* value = block->Init(feasign, false); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } + + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + +int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -422,9 +472,10 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (int i = 0; i < offsets.size(); ++i) { auto offset = offsets[i]; auto id = keys[offset]; - auto* value = block->Init(id); - std::copy_n(value + param_offset_, param_dim_, - pull_values + param_dim_ * offset); + auto* value = block->InitGet(id); + // std::copy_n(value + param_offset_, param_dim_, + // pull_values + param_dim_ * offset); + pull_values[offset] = (char*)value; } return 0; @@ -434,7 +485,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -494,6 +544,45 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, return 0; } +int32_t CommonSparseTable::push_sparse(const uint64_t* keys, + const float** values, size_t num) { + _push_sparse(keys, values, num); + return 0; +} + +int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, + const 
float** values, size_t num) { + rwlock_->RDLock(); + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &values, num, &offset_bucket]() -> int { + auto& offsets = offset_bucket[shard_id]; + for (size_t i = 0; i < offsets.size(); ++i) { + std::vector tmp_off = {0}; + optimizer_->update(keys + offsets[i], values[offsets[i]], num, + tmp_off, shard_values_[shard_id].get()); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 98cbf2b4a21057f64a5d510158907df1de393925..50c295da53464c8cc1589b27a6dbc233367991b4 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -61,12 +61,17 @@ class CommonSparseTable : public SparseTable { int32_t save(const std::string& path, const std::string& param); virtual std::pair print_table_stat(); - virtual int32_t pull_sparse(float* pull_values, const uint64_t* keys, - size_t num); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float** values, + size_t num); + // only for sparse geo table virtual int32_t push_sparse_param(const uint64_t* keys, const float* values, size_t num); @@ -81,6 +86,8 @@ class CommonSparseTable : public SparseTable { protected: virtual int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t _push_sparse(const uint64_t* keys, const float** values, + size_t num); private: const int task_pool_size_ = 11; diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/table/common_table.h index dc3cfa75ff689863773e88ef2d077b80c1f0a5d5..bc7f17f5f245794cebf96a8a4bc69e0dce8ac997 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/table/common_table.h @@ -98,8 +98,8 @@ class DenseTable : public Table { virtual ~DenseTable() {} virtual void *get_shard(size_t shard_idx) { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -123,8 +123,8 @@ class BarrierTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index 
a2acdfd20148ac282f6633e55ea450dd3367e5f2..8079003d1bf8f677aeaba91f8860504adcb853e0 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer { auto blas = GetBlas(); float lr = *(global_learning_rate_) * (*learning_rate); - VLOG(4) << "DSGD LearningRate: " << lr; blas.VCOPY(update_numel, update_values + begin, grads.data()); blas.SCAL(update_numel, lr, grads.data()); blas.VSUB(update_numel, param + begin, grads.data(), param + begin); @@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer { beta2_pow[0] = beta2_pow[0] * beta2; float lr_ = *(global_learning_rate_)*learning_rate[0]; - VLOG(4) << "DAdam LearningRate: " << lr_; lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); float* tmp_ = tmp.data(); diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ba79a381a6d881fdc153ad0e04e0ee436120b179..5c10fca98cda4d6cbdcb430ab5f2b8016a6ff7f2 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -26,8 +26,10 @@ #include #include "gflags/gflags.h" +#include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -47,6 +49,10 @@ namespace distributed { enum Mode { training, infer }; +static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 + << SPARSE_SHARD_BUCKET_NUM_BITS; + struct VALUE { explicit VALUE(size_t length) : length_(length), @@ -66,11 +72,11 @@ struct VALUE { bool is_entry_; // whether knock-in }; -inline bool count_entry(std::shared_ptr value, int threshold) { +inline bool count_entry(VALUE *value, int threshold) { return value->count_ >= threshold; } -inline bool probility_entry(std::shared_ptr value, float threshold) { +inline bool probility_entry(VALUE *value, float threshold) { UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); return uniform.GetValue() >= threshold; } @@ -87,7 +93,7 @@ class ValueBlock { value_dims_(value_dims), value_offsets_(value_offsets), value_idx_(value_idx) { - for (int x = 0; x < value_dims.size(); ++x) { + for (size_t x = 0; x < value_dims.size(); ++x) { value_length_ += value_dims[x]; } @@ -96,13 +102,15 @@ class ValueBlock { auto slices = string::split_string(entry_attr, ":"); if (slices[0] == "none") { entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); + threshold_ = 0; } else if (slices[0] == "count_filter_entry") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(&count_entry, std::placeholders::_1, threshold); + threshold_ = std::stoi(slices[1]); + entry_func_ = + std::bind(&count_entry, std::placeholders::_1, threshold_); } else if (slices[0] == "probability_entry") { - float threshold = std::stof(slices[1]); + threshold_ = std::stof(slices[1]); entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold); + std::bind(&probility_entry, std::placeholders::_1, threshold_); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Not supported Entry Type : %s, Only support [CountFilterEntry, " @@ -143,7 +151,7 @@ class ValueBlock { const std::vector &value_dims) { auto pts = 
std::vector(); pts.reserve(value_names.size()); - auto &values = values_.at(id); + auto values = GetValue(id); for (int i = 0; i < static_cast(value_names.size()); i++) { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], @@ -155,30 +163,59 @@ class ValueBlock { } // pull - float *Init(const uint64_t &id, const bool with_update = true) { - if (!Has(id)) { - values_[id] = std::make_shared(value_length_); - } + float *Init(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); - auto &value = values_.at(id); + auto &table = values_[bucket]; + auto res = table.find(id); - if (with_update) { - AttrUpdate(value); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + + table[id] = value; + + } else { + value = res->second; } + if (with_update) { + AttrUpdate(value, counter); + } return value->data_.data(); } - void AttrUpdate(std::shared_ptr value) { + VALUE *InitGet(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + // value = _alloc.acquire(value_length_); + table[id] = value; + } else { + value = (VALUE *)(void *)(res->second); + } + return value; + } + + void AttrUpdate(VALUE *value, const int counter) { // update state value->unseen_days_ = 0; - ++value->count_; + value->count_ += counter; if (!value->is_entry_) { value->is_entry_ = entry_func_(value); if (value->is_entry_) { // initialize - for (int x = 0; x < value_names_.size(); ++x) { + for (size_t x = 0; x < value_names_.size(); ++x) { initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], value_dims_[x]); } @@ -193,40 +230,73 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { - auto &value = values_.at(id); + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + VALUE *value = res->second; return value->data_.data(); } // for load, to reset count, unseen_days - std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); } + VALUE *GetValue(const uint64_t &id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } bool GetEntry(const uint64_t &id) { - auto &value = values_.at(id); + auto value = GetValue(id); return value->is_entry_; } void SetEntry(const uint64_t &id, const bool state) { - auto &value = values_.at(id); + auto value = GetValue(id); value->is_entry_ = state; } void Shrink(const int threshold) { - for (auto iter = values_.begin(); iter != values_.end();) { - auto &value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { - iter = values_.erase(iter); - } else { - ++iter; + for (auto &table : values_) { + for (auto iter = table.begin(); iter != table.end();) { + // VALUE* value = (VALUE*)(void*)(iter->second); + VALUE *value = iter->second; + value->unseen_days_++; + if (value->unseen_days_ >= threshold) { + butil::return_object(iter->second); + //_alloc.release(iter->second); + //_alloc.release(value); + iter = table.erase(iter); + } else { + ++iter; + } } } return; } + float GetThreshold() { return 
threshold_; } + size_t compute_bucket(size_t hash) { + if (SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); + } + } + private: bool Has(const uint64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { return false; } else { return true; @@ -234,8 +304,9 @@ class ValueBlock { } public: - std::unordered_map> values_; + robin_hood::unordered_map values_[SPARSE_SHARD_BUCKET_NUM]; size_t value_length_ = 0; + std::hash _hasher; private: const std::vector &value_names_; @@ -243,8 +314,9 @@ class ValueBlock { const std::vector &value_offsets_; const std::unordered_map &value_idx_; - std::function)> entry_func_; + std::function entry_func_; std::vector> initializers_; + float threshold_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h index 672d6e7d396874b5cc5a296f15e3842a3233410b..0e1d7ef03c129c2dc6f72d6e56fafb143d879bd4 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/table/depends/sparse.h @@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer { auto* value = block->Get(id); float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0]; - VLOG(4) << "SSGD LearningRate: " << learning_rate; float* param = value + param_offset; std::vector grads; @@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer { if (!block->GetEntry(id)) continue; auto* values = block->Get(id); float lr_ = *(global_learning_rate_) * (values + lr_offset)[0]; - VLOG(4) << "SAdam LearningRate: " << lr_; float* param = values + param_offset; float* moment1 = values + m1_offset; float* moment2 = values + m2_offset; diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c185dd17d792e4715ae884e66c412aa5f24f809f --- /dev/null +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
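The reworked ValueBlock above replaces the single map of shared_ptr values with SPARSE_SHARD_BUCKET_NUM (1 << 6 = 64) robin_hood maps of raw VALUE pointers and picks a bucket from the top bits of the key's std::hash, so Init, Get and Shrink only ever touch one small map. A standalone sketch of that bucket choice (illustrative, not the patch's code):

// Sketch of ValueBlock's bucket selection: top SPARSE_SHARD_BUCKET_NUM_BITS
// bits of std::hash of the key select one of 64 buckets.
#include <cstdint>
#include <cstdio>
#include <functional>

constexpr int kBucketBits = 6;                           // SPARSE_SHARD_BUCKET_NUM_BITS
constexpr size_t kBucketNum = size_t(1) << kBucketBits;  // SPARSE_SHARD_BUCKET_NUM

size_t compute_bucket(uint64_t key) {
  size_t hash = std::hash<uint64_t>()(key);
  if (kBucketNum == 1) return 0;
  return hash >> (sizeof(size_t) * 8 - kBucketBits);  // top bits -> [0, 64)
}

int main() {
  const uint64_t keys[] = {123456789ULL, 1ULL << 60, 1ULL << 62};
  for (uint64_t k : keys)
    std::printf("key=%llu bucket=%zu\n", (unsigned long long)k,
                compute_bucket(k));
  return 0;
}

Note that the exact bucket values depend on the standard library's std::hash; the point is only that every key deterministically lands in one of the 64 per-block maps.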
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +struct PullSparseValue { + explicit PullSparseValue(int numel, int dim) + : numel_(numel), + dim_(dim), + is_training_(true), + feasigns_(nullptr), + frequencies_(nullptr) {} + + explicit PullSparseValue(std::vector feasigns, + std::vector frequencies, int dim) { + numel_ = feasigns.size(); + dim_ = dim; + is_training_ = true; + feasigns_ = feasigns.data(); + frequencies_ = frequencies.data(); + } + + void DeserializeFromBytes(void* bytes) { + /* + |---isTraining--------------| + |---8*{num}B(keysData)------| + |---4*{num}B(Frequencies)---| + */ + auto* begin = reinterpret_cast(bytes); + is_training_ = reinterpret_cast(begin)[0]; + feasigns_ = reinterpret_cast(begin + sizeof(bool)); + frequencies_ = reinterpret_cast(begin + sizeof(bool) + + sizeof(uint64_t) * numel_); + } + + void Fission(const int shard_id, const int shard_num, + std::vector* offset_shard) const { + offset_shard->reserve(numel_ / shard_num + 1); + for (int x = 0; x < numel_; ++x) { + if (feasigns_[x] % shard_num == shard_id) { + offset_shard->push_back(x); + } + } + } + + int numel_; + int dim_; + bool is_training_; + uint64_t* feasigns_; + uint32_t* frequencies_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph/graph_edge.cc b/paddle/fluid/distributed/table/graph/graph_edge.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ab0d5a76d6715401dd55ce7487634b72d452ddf --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include +namespace paddle { +namespace distributed { + +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} + +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/table/graph/graph_edge.h new file mode 100644 index 0000000000000000000000000000000000000000..3dfe5a6f357a7cd7d79834a20b6411995665f4fa --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..816d31b979072c3f1679df1ea75cd9dc75c55b0a --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, 
&feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..8ad795ac97b5499c7b10361760f7ac16494c154b --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + 
FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a680875e3df4a9cd60f8fe1921b877dbb23c8a2 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
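parse_value_to_bytes above converts the textual feature values into the raw bytes of the target type, one stringstream-parsed value per slot, and parse_bytes_to_array reverses it. A simplified round trip for float values (illustrative only; it skips the text-parsing step and is not the patch's template):

// Illustrative round trip of FeatureNode's byte packing for numeric features:
// each value is stored as its raw in-memory bytes, concatenated in order.
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static std::string pack_floats(const std::vector<float>& vals) {
  std::string out(vals.size() * sizeof(float), '\0');
  if (!vals.empty()) std::memcpy(&out[0], vals.data(), out.size());
  return out;
}

static std::vector<float> unpack_floats(const std::string& bytes) {
  std::vector<float> out(bytes.size() / sizeof(float));
  if (!out.empty()) std::memcpy(out.data(), bytes.data(), out.size() * sizeof(float));
  return out;
}

int main() {
  std::string bytes = pack_floats({0.5f, 1.25f, -3.0f});
  for (float v : unpack_floats(bytes)) std::printf("%g ", v);
  std::printf("\n");  // prints: 0.5 1.25 -3
  return 0;
}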
+ +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h new file mode 100644 
index 0000000000000000000000000000000000000000..1787ab23b04316de9ad0622ff5524bc88bd51fe1 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.cc b/paddle/fluid/distributed/table/graph_edge.cc new file mode 100644 index 0000000000000000000000000000000000000000..cc90f4c6516c1873b078b96c550d0d52ac5d3b9c --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_edge.h" +#include +namespace paddle { +namespace distributed { + +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} + +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h new file mode 100644 index 0000000000000000000000000000000000000000..3dfe5a6f357a7cd7d79834a20b6411995665f4fa --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
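RandomSampler::sample_k above draws k distinct edge indices without materialising the whole [0, n) range: each draw remembers, in a hash map, which value the just-consumed slot should hold next, which amounts to a sparse Fisher-Yates shuffle. A standalone restatement of that trick (illustrative; the patch seeds rand() from clock_gettime instead of a fixed seed):

// Draw k distinct indices from [0, n) using a hash map of displaced slots.
#include <cstdio>
#include <cstdlib>
#include <unordered_map>
#include <vector>

std::vector<int> sample_k_distinct(int n, int k) {
  std::vector<int> out;
  if (n <= 0) return out;
  if (k > n) k = n;
  std::unordered_map<int, int> replaced;  // slot -> value it currently holds
  while (k-- > 0) {
    int r = std::rand() % n;
    auto hit = replaced.find(r);
    out.push_back(hit == replaced.end() ? r : hit->second);
    // Move whatever is in the last slot into the slot we just consumed,
    // then shrink the virtual range by one.
    auto last = replaced.find(n - 1);
    replaced[r] = (last == replaced.end()) ? n - 1 : last->second;
    --n;
  }
  return out;
}

int main() {
  std::srand(42);  // deterministic for the demo
  for (int idx : sample_k_distinct(10, 4)) std::printf("%d ", idx);
  std::printf("\n");
  return 0;
}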
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..27a2cafaf4f0fec95de818204ebd191a5083e50a --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
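Note: in the edge blobs above, GraphEdgeBlob stores only a flat array of neighbor ids (its add_edge drops the weight argument and get_weight always returns 1), while WeightedGraphEdgeBlob keeps a parallel weight array. A short usage sketch, assuming the graph_edge.h path added by this patch is on the include path:

// Sketch of the two edge-blob variants defined above.
#include <iostream>
#include "paddle/fluid/distributed/table/graph_edge.h"

int main() {
  paddle::distributed::WeightedGraphEdgeBlob weighted;
  weighted.add_edge(45, 0.34f);
  weighted.add_edge(145, 0.31f);
  std::cout << weighted.size() << " neighbors, first id " << weighted.get_id(0)
            << " with weight " << weighted.get_weight(0) << std::endl;

  paddle::distributed::GraphEdgeBlob unweighted;
  unweighted.add_edge(45, 0.34f);  // the weight is ignored by this variant
  std::cout << "unweighted get_weight(0) = " << unweighted.get_weight(0)
            << std::endl;  // always 1
  return 0;
}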
+ +#include "paddle/fluid/distributed/table/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h new file mode 100644 index 0000000000000000000000000000000000000000..c3e8e3ce5b50d06945857ded1db168f84f955c5f --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc new file mode 100644 index 0000000000000000000000000000000000000000..059a1d64bc392d7ef6936c008bbeec3bef3a5fb9 --- 
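Note: the parse_value_to_bytes / parse_bytes_to_array templates above convert between string-encoded feature values and a packed fixed-width binary string, with the element type supplied as a template argument; the int32_t used below is an assumption matching the "int32" feature round-trip exercised by testFeatureNodeSerializeInt later in this patch:

// Packs two decimal strings as int32_t values and unpacks them again,
// mirroring testFeatureNodeSerializeInt in graph_node_test.cc below.
#include <cassert>
#include <cstdint>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/table/graph_node.h"

int main() {
  using paddle::distributed::FeatureNode;
  std::string packed =
      FeatureNode::parse_value_to_bytes<int32_t>({"123", "345"});
  std::vector<int32_t> values =
      FeatureNode::parse_bytes_to_array<int32_t>(packed);
  assert(values.size() == 2);
  assert(values[0] == 123 && values[1] == 345);
  return 0;
}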
/dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int 
right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h new file mode 100644 index 0000000000000000000000000000000000000000..cfc341d27c6b766fcee57e8973a4353d4fe93b4e --- /dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc index 9b276e7de5c92d495f9d40535033b0a82186bc82..04cd1136382a4e24eb0a6d196ec01ad68ed56309 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.cc +++ b/paddle/fluid/distributed/table/sparse_geo_table.cc @@ -22,8 +22,17 @@ int32_t SparseGeoTable::pull_geo_param(const uint32_t trainer_id, std::vector* ids) { geo_recorder->GetAndClear(trainer_id, ids); auto dim = _config.common().dims()[0]; + + std::vector frequencies; + frequencies.resize(ids->size(), 1); + + auto pull_value = PullSparseValue(ids->size(), dim); + pull_value.is_training_ = true; + pull_value.feasigns_ = ids->data(); + pull_value.frequencies_ = frequencies.data(); + values->resize(ids->size() * dim); - CommonSparseTable::pull_sparse(values->data(), ids->data(), ids->size()); + CommonSparseTable::pull_sparse(values->data(), pull_value); return 0; } diff --git a/paddle/fluid/distributed/table/table.cc 
b/paddle/fluid/distributed/table/table.cc index dfaaa6ffc12c2b363de5f26df01ab4b8db9f0153..600be954cb59663fff6f867c020248a92e81a151 100644 --- a/paddle/fluid/distributed/table/table.cc +++ b/paddle/fluid/distributed/table/table.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/tensor_accessor.h" @@ -25,7 +26,7 @@ namespace paddle { namespace distributed { - +REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); REGISTER_PSCORE_CLASS(Table, SparseGeoTable); @@ -75,5 +76,6 @@ int32_t Table::initialize_accessor() { _value_accesor.reset(accessor); return 0; } + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 65c99d2bbd40d4567f49eb84bd84173a0a3fee0b..81a1ff5eced2bb36b8f917a31de1e214b272bfa3 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,8 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -46,10 +48,17 @@ class Table { return 0; } - virtual int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) = 0; + virtual int32_t pull_sparse_ptr(char **pull_values, const uint64_t *keys, + size_t num) { + VLOG(0) << "NOT IMPLEMENT"; + return 0; + } + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; + virtual int32_t push_sparse(const uint64_t *keys, const float **values, + size_t num){}; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -141,5 +150,6 @@ class TableManager { TableManager() {} ~TableManager() {} }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/table/tensor_table.h index 1a8f1a9cd9adb841c3ed1fcf849a3a293c47cc52..080682d131420b5b57ce470b6b570fe24a1925b3 100644 --- a/paddle/fluid/distributed/table/tensor_table.h +++ b/paddle/fluid/distributed/table/tensor_table.h @@ -52,8 +52,8 @@ class TensorTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -102,8 +102,8 @@ class DenseTensorTable : public TensorTable { DenseTensorTable() {} virtual ~DenseTensorTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -158,8 +158,8 @@ class GlobalStepTable : public DenseTensorTable { GlobalStepTable() 
{} virtual ~GlobalStepTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index adedd049023daa053aebcff60dd245408f4901bd..af87e1b6cc61d190cf06b601f05455d8ac976d71 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,8 +1,10 @@ set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor +ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table +tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) @@ -15,3 +17,6 @@ cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS s set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index fbd236012f523715451e9c21d3f2028f88d573f3..8fb3434af6e281762b762bbc8d01b372e5c0ee34 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -212,8 +212,8 @@ void RunBrpcPushSparse() { /*-----------------------Test Server Init----------------------------------*/ LOG(INFO) << "Run pull_sparse_param"; - auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0, - fea_keys.data(), fea_keys.size()); + auto pull_status = worker_ptr_->pull_sparse( + fea_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { fea_values.data()[idx] *= 2.0; @@ -241,7 +241,7 @@ void RunBrpcPushSparse() { push_status.wait(); auto pull_param_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_param_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { @@ -275,7 +275,7 @@ void RunBrpcPushSparse() { push_grad_status.wait(); auto pull_update_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), 
fea_keys.size(), true); pull_update_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index 22e11acf6584eefa3e41ccd950feb2dfb4bf3720..c9f15db3f788e13ca2f9a8279358358f1c50131b 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/table/common_dense_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/table.h" @@ -53,14 +54,18 @@ TEST(SparseGeoTable, SSUM) { // test push_sparse_param, and create params std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; std::vector init_values; for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { init_values.push_back(0.0); } table->push_sparse_param(init_keys.data(), init_values.data(), init_keys.size()); + std::vector pull_values(init_values.size()); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(pull_values.data(), value); + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5); } diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b268bb449e14619048e89c8933dbae7daf66537b --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -0,0 +1,556 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +void testSampleNodes( + std::shared_ptr& worker_ptr_) { + std::vector ids; + auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); + std::unordered_set s; + std::unordered_set s1 = {37, 59}; + pull_status.wait(); + for (auto id : ids) s.insert(id); + ASSERT_EQ(true, s.size() == s1.size()); + for (auto id : s) { + ASSERT_EQ(true, s1.find(id) != s1.end()); + } +} + +void testFeatureNodeSerializeInt() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeInt64() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeFloat32() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + std::cout << "Float " << out2[0] << " " << 123.123 << std::endl; + eps = out2[0] - 123.123; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testFeatureNodeSerializeFloat64() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + eps = out2[0] - 123.123; + std::cout << "Float64 " << out2[0] << " " << 123.123 << std::endl; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testSingleSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + auto pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 37), 4, vs); + pull_status.wait(); + + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + 
ASSERT_EQ(true, s1.find(g) != s1.end()); + } + VLOG(0) << "test single done"; + s.clear(); + s1.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 96), 4, vs); + pull_status.wait(); + s1 = {111, 48, 247}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testBatchSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + std::vector v = {37, 96}; + auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs); + pull_status.wait(); + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } + s.clear(); + s1.clear(); + s1 = {111, 48, 247}; + for (auto g : vs[1]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testGraphToBuffer(); +// std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"), +// std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"), +// std::string("59\ttreat\t45;0.34\t145;0.31\t112;0.21"), +// std::string("97\tfood\t48;1.4\t247;0.31\t111;1.21")}; + +std::string edges[] = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::string nodes[] = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], bool load_edge) { + std::ofstream ofile; + ofile.open(file_name); + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } + } + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + 
::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); +} + +void RunClient( + 
std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunBrpcPushSparse() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, 1); + prepare_file(node_file_name, 0); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + std::thread* server_thread2 = new std::thread(RunServer2); + sleep(1); + + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + auto pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector>> vs; + testSampleNodes(worker_ptr_); + sleep(5); + testSingleSampleNeighboor(worker_ptr_); + testBatchSampleNeighboor(worker_ptr_); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 10240001024), 4, vs); + pull_status.wait(); + ASSERT_EQ(0, vs[0].size()); + + std::vector nodes; + pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 37); + nodes.clear(); + pull_status = worker_ptr_->pull_graph_list(0, 0, 1, 4, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 59); + for (auto g : nodes) { + std::cout << g.get_id() << std::endl; + } + distributed::GraphPyServer server1, server2; + distributed::GraphPyClient client1, client2; + std::string ips_str = "127.0.0.1:5211;127.0.0.1:5212"; + std::vector edge_types = {std::string("user2item")}; + std::vector node_types = {std::string("user"), + std::string("item")}; + VLOG(0) << "make 2 servers"; + server1.set_up(ips_str, 127, node_types, edge_types, 0); + server2.set_up(ips_str, 127, node_types, edge_types, 1); + + server1.add_table_feat_conf("user", "a", "float32", 1); + server1.add_table_feat_conf("user", "b", "int32", 2); + server1.add_table_feat_conf("user", "c", "string", 1); + server1.add_table_feat_conf("user", "d", "string", 1); + server1.add_table_feat_conf("item", "a", "float32", 1); + + server2.add_table_feat_conf("user", "a", "float32", 1); + server2.add_table_feat_conf("user", "b", "int32", 2); + server2.add_table_feat_conf("user", "c", "string", 1); + server2.add_table_feat_conf("user", "d", "string", 1); + server2.add_table_feat_conf("item", "a", "float32", 1); + + client1.set_up(ips_str, 127, node_types, edge_types, 0); + + client1.add_table_feat_conf("user", "a", "float32", 1); + 
client1.add_table_feat_conf("user", "b", "int32", 2); + client1.add_table_feat_conf("user", "c", "string", 1); + client1.add_table_feat_conf("user", "d", "string", 1); + client1.add_table_feat_conf("item", "a", "float32", 1); + + client2.set_up(ips_str, 127, node_types, edge_types, 1); + + client2.add_table_feat_conf("user", "a", "float32", 1); + client2.add_table_feat_conf("user", "b", "int32", 2); + client2.add_table_feat_conf("user", "c", "string", 1); + client2.add_table_feat_conf("user", "d", "string", 1); + client2.add_table_feat_conf("item", "a", "float32", 1); + + server1.start_server(false); + std::cout << "first server done" << std::endl; + server2.start_server(false); + std::cout << "second server done" << std::endl; + client1.start_client(); + std::cout << "first client done" << std::endl; + client2.start_client(); + std::cout << "first client done" << std::endl; + std::cout << "started" << std::endl; + VLOG(0) << "come to set local server"; + client1.bind_local_server(0, server1); + VLOG(0) << "first bound"; + client2.bind_local_server(1, server2); + VLOG(0) << "second bound"; + client1.load_node_file(std::string("user"), std::string(node_file_name)); + client1.load_node_file(std::string("item"), std::string(node_file_name)); + client1.load_edge_file(std::string("user2item"), std::string(edge_file_name), + 0); + nodes.clear(); + + nodes = client1.pull_graph_list(std::string("user"), 0, 1, 4, 1); + + ASSERT_EQ(nodes[0].get_id(), 59); + nodes.clear(); + + // Test Pull by step + + std::unordered_set count_item_nodes; + // pull by step 2 + for (int test_step = 1; test_step < 4; test_step++) { + count_item_nodes.clear(); + std::cout << "check pull graph list by step " << test_step << std::endl; + for (int server_id = 0; server_id < 2; server_id++) { + for (int start_step = 0; start_step < test_step; start_step++) { + nodes = client1.pull_graph_list(std::string("item"), server_id, + start_step, 12, test_step); + for (auto g : nodes) { + count_item_nodes.insert(g.get_id()); + } + nodes.clear(); + } + } + ASSERT_EQ(count_item_nodes.size(), 12); + } + + vs = client1.batch_sample_neighboors(std::string("user2item"), + std::vector(1, 96), 4); + ASSERT_EQ(vs[0].size(), 3); + std::vector node_ids; + node_ids.push_back(96); + node_ids.push_back(37); + vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4); + + ASSERT_EQ(vs.size(), 2); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + ASSERT_EQ(nodes_ids.size(), 2); + ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || + (nodes_ids[0] == 37 && nodes_ids[1] == 59)); + + // Test get node feat + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + std::vector feature_names; + feature_names.push_back(std::string("c")); + feature_names.push_back(std::string("d")); + auto node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0]; + VLOG(0) << "get_node_feat: " << node_feat[0][1]; + VLOG(0) << "get_node_feat: " << node_feat[1][0]; + VLOG(0) << "get_node_feat: " << node_feat[1][1]; + + // Test string + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + // std::vector feature_names; + feature_names.clear(); + feature_names.push_back(std::string("a")); + feature_names.push_back(std::string("b")); + node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + 
ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[0][1].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][1].size(); + + std::remove(edge_file_name); + std::remove(node_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); + testFeatureNodeSerializeInt(); + testFeatureNodeSerializeInt64(); + testFeatureNodeSerializeFloat32(); + testFeatureNodeSerializeFloat64(); + testGraphToBuffer(); + client1.stop_server(); +} + +void testGraphToBuffer() { + ::paddle::distributed::GraphNode s, s1; + s.set_feature_size(1); + s.set_feature(0, std::string("hhhh")); + s.set_id(65); + int size = s.get_size(true); + char str[size]; + s.to_buffer(str, true); + s1.recover_from_buffer(str); + ASSERT_EQ(s.get_id(), s1.get_id()); + VLOG(0) << s.get_feature(0); + VLOG(0) << s1.get_feature(0); +} + +TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); } diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 6db95c5fac211b94db726ee77c9122a8824c2351..26bede392d6fade06dd29cf5e5a28295bb1cbc43 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -55,9 +55,14 @@ TEST(CommonSparseTable, SGD) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // for check std::vector total_gradients; @@ -100,7 +105,8 @@ TEST(CommonSparseTable, SGD) { std::vector pull_values; pull_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + table->pull_sparse(init_values.data(), value); + for (size_t i = 0; i < init_values.size(); ++i) { auto update_val = init_values[i] - 1.0 * total_gradients[i]; ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5); @@ -148,9 +154,13 @@ TEST(CommonSparseTable, Adam) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // push gradient std::vector> trainer_keys; diff --git a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h new file mode 100644 index 0000000000000000000000000000000000000000..f5075b4545af04b7179fcd3ba37bea3ecd58b6c8 --- /dev/null +++ b/paddle/fluid/distributed/thirdparty/round_robin.h @@ -0,0 +1,2685 @@ +// ______ _____ ______ _________ +// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ / +// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ / +// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ / +// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/ +// _/_____/ +// +// Fast & memory efficient hashtable based on robin hood hashing for +// 
C++11/14/17/20 +// https://github.com/martinus/robin-hood-hashing +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR \ + 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +#include +#define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +#include +#define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +#include +#define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream &operator<<(std::ostream &os, Counts const &c) { + return os << c.shiftUp << " shiftUp" << std::endl + << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts &counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +#define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +#error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +#ifdef _MSC_VER +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +#endif +#include +#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept->int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + } \ + (x) +#else +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +#endif +#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +#endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +#define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +#define ROBIN_HOOD_LIKELY(condition) condition +#define ROBIN_HOOD_UNLIKELY(condition) condition +#else +#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +#ifdef _NATIVE_WCHAR_T_DEFINED +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor +// being constexpr +#ifdef _MSC_VER +#if _MSC_VER <= 1900 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \ + std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see +// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +#define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant< + std::size_t, alignof(typename std::remove_all_extents::type)> {}; + +template +class integer_sequence { + public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { return sizeof...(Ints); } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, + "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct IntSeqCombiner; + + template + struct IntSeqCombiner, + integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = typename IntSeqCombiner< + typename IntSeqImpl::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = + typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned +// char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target +// type". Use with +// care! 
+template +inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, +// thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args &&... args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T *assertNotNull(T *t, Args &&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const *ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory +// in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per +// allocation is the size of a +// pointer. +template +class BulkPoolAllocator { + public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED( + o) /*unused*/) noexcept : mHead(nullptr), + mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator &&o) noexcept + : mHead(o.mHead), + mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator & + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { reset(); } + + // Deallocates all allocated memory. + void reset() noexcept { + while (mListForFree) { + T *tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T *allocate() { + T *tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T *obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator + // is from now on + // responsible for freeing the data (with free()). If the provided data is not + // large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in + // the destructor. 
+ void addOrFree(void *ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator &other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + + private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually + // with addOrFree. In + // practice, this should not matter much. + ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void *ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto *const headT = reinterpret_cast_no_cast_align_warning( + reinterpret_cast(ptr) + ALIGNMENT); + + auto *const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty + // cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning( + head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning( + head + (numElements - 1) * ALIGNED_SIZE) = mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T *performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " + << ALIGNED_SIZE << " * " + << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + +// enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > + ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for + // walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = + ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT"); + + T *mHead{nullptr}; + T **mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + // we are not using the data, so just free it. 
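The ALIGNED_SIZE expression above rounds sizeof(T) up to the next multiple of ALIGNMENT so that pool slots are properly aligned and, per the static_asserts, large enough to double as free-list links. A worked instance of that rounding formula, with numbers chosen only for illustration:

// ((size - 1) / alignment + 1) * alignment rounds size up to a multiple of
// alignment, e.g. a 12-byte element with 8-byte alignment occupies 16 bytes.
static_assert(((12 - 1) / 8 + 1) * 8 == 16, "12 rounded up to 8-byte granularity");
static_assert(((16 - 1) / 8 + 1) * 8 == 16, "exact multiples are unchanged");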
+ void addOrFree(void *ptr, + size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator + : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it +// either, so I'm making +// my own here. +namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = + noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not +// is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct +// is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first(), second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair const &o) noexcept( + noexcept(T1(std::declval())) && + noexcept(T2(std::declval()))) + : first(o.first), second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair &&o) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)), second(std::move(o.second)) {} + + constexpr pair(T1 &&a, T2 &&b) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(a)), second(std::move(b)) {} + + template + constexpr pair(U1 &&a, U2 &&b) noexcept( + noexcept(T1(std::forward(std::declval()))) && + noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)), second(std::forward(b)) {} + + template +// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize +// all members" +// if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval &>(), + std::declval &>(), + ROBIN_HOOD_STD::index_sequence_for< + U1...>(), + ROBIN_HOOD_STD::index_sequence_for< + U2...>()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair( + std::tuple &a, std::tuple &b, + ROBIN_HOOD_STD::index_sequence /*unused*/, + ROBIN_HOOD_STD::index_sequence< + I2...> /*unused*/) noexcept(noexcept(T1(std:: + forward(std::get( + std::declval< + std::tuple + &>()))...)) && + noexcept(T2(std::forward(std::get( + std::declval< + std::tuple &>()))...))) + : first(std::forward(std::get(a))...), + second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair &o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair &a, pair &b) noexcept( + noexcept(std::declval &>().swap(std::declval &>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const &x, pair const &y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const &x, pair const &y) { + return !(x == y); +} +template +inline constexpr bool +operator<(pair const &x, pair const &y) noexcept( + noexcept(std::declval() < std::declval()) && + noexcept(std::declval() < std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const &x, pair const &y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const &x, pair const &y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const &x, pair const &y) { + return !(x < y); +} + +inline size_t hash_bytes(void const *ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const *const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const *const data8 = + reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, + // fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step +// of the result. 
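A usage sketch of the two hashing primitives defined above; the include path is an assumption, since the vendored header's location in the tree is not shown in this hunk.

#include <cstddef>
#include "robin_hood.h"  // assumed include path for this vendored header

inline std::size_t hash_example() {
  const char key[] = "paddle";
  // hash_bytes: murmurhash2-style mix over raw bytes (length excludes the NUL).
  std::size_t h1 = robin_hood::hash_bytes(key, sizeof(key) - 1);
  // hash_int: the murmurhash3 finalizer, for integer-sized keys.
  std::size_t h2 = robin_hood::hash_int(42U);
  return h1 ^ h2;
}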
+template +struct hash : public std::hash { + size_t operator()(T const &obj) const noexcept(noexcept( + std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const &str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const &sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T *ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const &obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem +// when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const &o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const &o) noexcept( + noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for +// std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as +// member, +// or a DataNode with a pointer to std::pair. 
Which DataNode +// representation to use +// depends on how fast the swap() operation is. Heuristically, this is +// automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n +// info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set +// to 1 means the +// corresponding node contains data. Set to 2 means the corresponding Node is +// filled, but it +// actually belongs to the previous position and was pushed out because that +// place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at +// end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why +// I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair< + typename std::conditional::type, + T>>::type, + 4, 16384, IsFlat> { + public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, + T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = + Table; + + private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. + using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for + // small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we + // allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + Args &&... args) noexcept(noexcept(value_type(std:: + forward( + args)...))) + : mData(std::forward(args)...) 
{} + + DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode + &&n) noexcept(std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M &ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const *operator->() const noexcept { return &mData; } + value_type *operator->() noexcept { return &mData; } + + const value_type &operator*() const noexcept { return mData; } + + value_type &operator*() noexcept { return mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData.second; + } + + void swap(DataNode &o) noexcept( + noexcept(std::declval().swap(std::declval()))) { + mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M &map, Args &&... args) : mData(map.allocate()) { + ::new (static_cast(mData)) + value_type(std::forward(args)...); + } + + DataNode(M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode &&n) noexcept : mData(std::move(n.mData)) {} + + void destroy(M &map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { mData->~value_type(); } + + value_type const *operator->() const noexcept { return mData; } + + value_type *operator->() noexcept { return mData; } + + const value_type &operator*() const { return *mData; } + + value_type &operator*() { return *mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData->second; + } + + void swap(DataNode &o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type *mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const + // required) + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(Node const &n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus we just + // route k through. + // No need to disable this because it's just not used if not applicable. 
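The two DataNode specializations above are what ultimately distinguish the containers defined near the end of this header: unordered_flat_map stores pairs in the table itself, unordered_node_map stores pool-allocated pairs behind a pointer, and unordered_map picks between them heuristically. A usage sketch, with the include path assumed as before:

#include <string>
#include "robin_hood.h"  // assumed include path

// Small, cheap-to-move values: the flat (in-table) representation is a good fit.
robin_hood::unordered_flat_map<int, int> id_to_count;

// Larger values, or callers that keep references across inserts and rehashes:
// the node representation keeps each pair at a stable heap address.
robin_hood::unordered_node_map<std::string, std::string> config;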
+ ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(key_type const &k) const noexcept { return k; } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const &>::type + getFirstConst(value_type const &vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const &source, M &target) const { + auto const *const src = reinterpret_cast(source.mKeyVals); + auto *tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = + target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), + tgt); + } + }; + + template + struct Cloner { + void operator()(M const &s, M &t) const { + auto const numElementsWithBuffer = + t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), + t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M &m) const noexcept { m.mNumElements = 0; } + + void nodesDoNotDeallocate(M &m) const noexcept { m.mNumElements = 0; } + }; + + template + struct Destroyer { + void nodes(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = + typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return + // true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only + // enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy + // ctor. + + // Conversion constructor from iterator to const_iterator. 
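The memcpy fast path in Cloner above is only valid because robin_hood::pair, unlike std::pair as noted earlier in this header, stays trivially copyable for trivially copyable members. A minimal check of that property, using the macro defined near the top of the header (include path assumed as before):

#include "robin_hood.h"  // assumed include path

static_assert(ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(robin_hood::pair<int, int>),
              "pairs of trivially copyable members can be cloned with memcpy");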
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const &other) noexcept : mKeyVals(other.mKeyVals), + mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr) noexcept : mKeyVals(valPtr), + mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr), + mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter &operator=(Iter const &other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter &operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int)noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { return **mKeyVals; } + + pointer operator->() const { return &**mKeyVals; } + + template + bool operator==(Iter const &o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const &o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but + // unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See + // map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero + // one. + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +#if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +#else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +#endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const *mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey &&key, size_t *idx, InfoType *info) const { + // In addition to whatever hash is used, add another mul & shift so we get + // better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType *info, size_t *idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType *info, size_t *idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
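keyToIdx above splits one mixed 64-bit hash into a bucket index and a small "info" value. The sketch below reproduces that split with the initial constants (InitialInfoNumBits == 5, so InfoMask == 31, the initial mInfoInc == 32 and mInfoHashShift == 0); the local names are illustrative, not the class members.

#include <cstddef>
#include <cstdint>

// Low 5 bits become extra "info" entropy on top of the base increment;
// the remaining bits select the bucket via the power-of-two mask.
inline void split_hash_example(std::uint64_t mixed, std::size_t mask,
                               std::uint32_t* info, std::size_t* idx) {
  const unsigned info_bits = 5;                             // InitialInfoNumBits
  const std::uint64_t info_mask = (1U << info_bits) - 1U;   // InfoMask == 31
  *info = 32U + static_cast<std::uint32_t>(mixed & info_mask);  // InitialInfoInc == 32
  *idx = static_cast<std::size_t>(mixed >> info_bits) & mask;
}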
+ void shiftUp(size_t startIdx, size_t const insertion_idx) noexcept( + std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) + Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept( + std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for + // the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const &key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not + // help. + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 + ? 0 + : static_cast(std::distance( + mKeyVals, + reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table &o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is + // resized. + // @return True on success, false if something went wrong + void insert_move(Node &&keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not + // there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto &l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + + public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash(), WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the + // first insert. 
+ // This tremendously speeds up ctor & dtor of a map that never receives an + // element. The + // penalty is payed at the first insert, and not before. Lookup of this empty + // map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is + // dictated by the + // standard, but we can ignore it. + explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash &h = Hash{}, + const KeyEqual &equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && + noexcept(KeyEqual(equal))) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table &&o) noexcept : WHash(std::move(static_cast(o))), + WKeyEqual(std::move(static_cast(o))), + DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table &operator=(Table &&o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table &o) + : WHash(static_cast(o)), + WKeyEqual(static_cast(o)), + DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate + // through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. 
Copy constructor of each entry is used. + // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table &operator=(Table const &o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we + // want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. + destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{} + .nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to + // realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table &o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. + return; + } + + Destroyer::value>{} + .nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. 
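As the comment just above notes, operator== compares contents regardless of iteration order; a small usage sketch (include path assumed as before):

#include "robin_hood.h"  // assumed include path

inline bool equality_ignores_order() {
  robin_hood::unordered_map<int, int> a{{1, 10}, {2, 20}};
  robin_hood::unordered_map<int, int> b{{2, 20}, {1, 10}};
  return a == b;  // true: each entry of one table is looked up in the other
}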
+ bool operator==(const Table &other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const &otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table &other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + const key_type &key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + key_type &&key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto &&vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args &&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type &key, Args &&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type &&key, Args &&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, + const key_type &key, Args &&... args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type &&key, + Args &&... 
args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type &key, + Mapped &&obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type &&key, Mapped &&obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + const key_type &key, + Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + key_type &&key, Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type &keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type &&keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count( + const OtherKey &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains( + const OtherKey &key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q &>::type at( + key_type const &key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const &>::type at( + key_type const &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find( + const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey &key, + is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if< + Self_::is_transparent, // NOLINT(modernize-use-nodiscard) + const_iterator>::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey &key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find( + const OtherKey &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and + // only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), + const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type &key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && + WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying + // table. + // Does not do anything if load_factor is too large for decreasing the table's + // size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) + size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= + (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that + // doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so + // we can load + // 64bit types. 
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against + // 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + + private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && + newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. + if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node *const oldKeyVals = mKeyVals; + uint8_t const *const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = + calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the + // middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it + // helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. 
+ if (oldKeyVals != + reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, + calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey &&key, Args &&... args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey &&key, Mapped &&obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { + overflow_error, + key_found, + new_node, + overwrite_node + }; + + // Finds key, and if not already present prepares a spot where to pot the key + // & value. + // This potentially shifts nodes out of the way, updates mInfo and number of + // inserted + // elements, so the only operation left to do is create/assign a new node at + // that spot. 
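The try_emplace and insert_or_assign implementations above mirror the std::unordered_map API; a short usage sketch (include path assumed as before):

#include <string>
#include "robin_hood.h"  // assumed include path

inline void upsert_example(robin_hood::unordered_map<std::string, int>* counts) {
  counts->try_emplace("paddle", 0);       // constructs the value only if the key is absent
  counts->insert_or_assign("paddle", 1);  // overwrites an existing mapped value
  ++(*counts)["gpu"];                     // operator[] default-constructs missing entries
}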
+ template + std::pair insertKeyPrepareEmptySpot(OtherKey &&key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair( + insertion_idx, idx == insertion_idx ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" + << mNumElements + << ", maxNumElementsAllowed=" << maxNumElementsAllowed + << ", load=" << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + nextHashMultiplier(); + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space + // left! + // Try to rehash instead. Delete freed memory so we don't steadyily + // increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. 
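try_increase_info above halves every info byte in bulk, eight at a time, by loading them as one uint64_t; the masked shift below shows the same trick in isolation (illustrative name):

#include <cstdint>

// Shift each of the eight packed info bytes right by one; the mask stops a
// bit of one byte from leaking into its lower neighbour.
inline std::uint64_t halve_each_byte(std::uint64_t eight_info_bytes) {
  return (eight_info_bytes >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f);
}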
+ // Make sure we only have odd numbers, so that the multiplication is + // reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); + } + return true; + } + + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. + // This is necessary + // so that the hash stays a mixing function (and thus doesn't have any + // information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); + } + + void destroy() { + if (0 == mMask) { + // don't deallocate! + return; + } + + Destroyer::value>{} + .nodesDoNotDeallocate(*this); + + // This protection against not deleting mMask shouldn't be needed as it's + // sufficiently + // protected with the 0==mMask check, but I have this anyways because g++ 7 + // otherwise + // reports a compile error: attempt to free a non-heap object 'fm' + // [-Werror=free-nonheap-object] + if (mKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + } + + void init() noexcept { + mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); + mInfo = reinterpret_cast(&mMask); + mNumElements = 0; + mMask = 0; + mMaxNumElementsAllowed = 0; + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // members are sorted so no padding occurs + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node *mKeyVals = + reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t *mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 + // 16 byte 56 if NodeAllocator +}; + +} // namespace detail + +// map + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_map = detail::Table< + sizeof(robin_hood::pair) <= sizeof(size_t) * 6 && + std::is_nothrow_move_constructible>::value && + std::is_nothrow_move_assignable>::value, + MaxLoadFactor100, Key, T, Hash, KeyEqual>; + +// set + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_set = + detail::Table::value && + std::is_nothrow_move_assignable::value, + MaxLoadFactor100, Key, void, Hash, KeyEqual>; + +} // namespace robin_hood + +#endif diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/fluid/extension/include/ext_dispatch.h index eed736046496f3af799c82f4f9236a3fc88450fc..9b3e199708adc93356c214df3be217f67d2e8949 100644 --- a/paddle/fluid/extension/include/ext_dispatch.h +++ b/paddle/fluid/extension/include/ext_dispatch.h @@ -47,6 +47,22 @@ namespace paddle { } \ }() +#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16, paddle::float16, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + ::paddle::ToString(__dtype__), "`"); \ + } \ + }() + ///////// Integral Dispatch Marco /////////// #define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ @@ -68,6 +84,22 @@ namespace paddle { } \ }() +///////// Complex Dispatch Marco /////////// + +#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + ///////// Floating and Integral Dispatch Marco /////////// #define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ @@ -93,6 +125,55 @@ namespace paddle { } \ }() +///////// Floating and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Floating, Integral and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + // TODO(chenweihang): Add more Marcos in the future if needed } // namespace paddle diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h index 46c4bac23606449dd2ba68b25818fd58d88f6204..3890631a6f8a9e99948e32cdd3cb8c1e00c2de75 100644 --- a/paddle/fluid/extension/include/ext_dtype.h +++ b/paddle/fluid/extension/include/ext_dtype.h @@ -16,10 +16,17 @@ limitations under the License. */ #include #include +#include "complex128.h" // NOLINT +#include "complex64.h" // NOLINT #include "ext_exception.h" // NOLINT +#include "float16.h" // NOLINT namespace paddle { +using complex64 = paddle::platform::complex64; +using complex128 = paddle::platform::complex128; +using float16 = paddle::platform::float16; + enum class DataType { BOOL, INT8, @@ -27,8 +34,11 @@ enum class DataType { INT16, INT32, INT64, + FLOAT16, FLOAT32, FLOAT64, + COMPLEX64, + COMPLEX128, // TODO(JiabinYang) support more data types if needed. 
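+  // The new FLOAT16 / COMPLEX64 / COMPLEX128 entries correspond to the
+  // paddle::float16, paddle::complex64 and paddle::complex128 aliases declared
+  // above, and are the cases the new PD_DISPATCH_FLOATING_AND_HALF_TYPES and
+  // PD_DISPATCH_*_COMPLEX_TYPES macros in ext_dispatch.h switch over. A rough
+  // usage sketch (hypothetical kernel and tensor names; `data_t` is assumed to
+  // be the per-case type alias introduced by PD_PRIVATE_CASE_TYPE):
+  //
+  //   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+  //       x.type(), "my_op_forward", ([&] {
+  //         my_op_forward_kernel<data_t>(
+  //             x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+  //       }));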
}; @@ -46,24 +56,33 @@ inline std::string ToString(DataType dtype) { return "int32_t"; case DataType::INT64: return "int64_t"; + case DataType::FLOAT16: + return "float16"; case DataType::FLOAT32: return "float"; case DataType::FLOAT64: return "double"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; default: PD_THROW("Unsupported paddle enum data type."); } } -#define PD_FOR_EACH_DATA_TYPE(_) \ - _(bool, DataType::BOOL) \ - _(int8_t, DataType::INT8) \ - _(uint8_t, DataType::UINT8) \ - _(int16_t, DataType::INT16) \ - _(int, DataType::INT32) \ - _(int64_t, DataType::INT64) \ - _(float, DataType::FLOAT32) \ - _(double, DataType::FLOAT64) +#define PD_FOR_EACH_DATA_TYPE(_) \ + _(bool, DataType::BOOL) \ + _(int8_t, DataType::INT8) \ + _(uint8_t, DataType::UINT8) \ + _(int16_t, DataType::INT16) \ + _(int, DataType::INT32) \ + _(int64_t, DataType::INT64) \ + _(float16, DataType::FLOAT16) \ + _(float, DataType::FLOAT32) \ + _(double, DataType::FLOAT64) \ + _(complex64, DataType::COMPLEX64) \ + _(complex128, DataType::COMPLEX128) template struct DataTypeToCPPType; diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index a3b9a4c491033db80f4c10829c0ee61b50387b25..c400164c7543da9878d0fb51a6f239dfaff5beff 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -56,38 +56,55 @@ using Tensor = paddle::Tensor; ///////////////// Util Define and Function //////////////// -inline std::string Grad(const std::string& var_name) { +constexpr char kGradTensorSuffix[] = "@GRAD"; +constexpr char kTensorVectorSuffix[] = "@VECTOR"; + +// Used for Construct Grad Tensor name +inline std::string Grad(const std::string& t_name) { + std::string result; + result.reserve(t_name.size() + 5U); + result += t_name; + result += kGradTensorSuffix; + return result; +} + +// Used for Construct std::vector name +inline std::string Vec(const std::string& t_name) { std::string result; - result.reserve(var_name.size() + 5U); - result += var_name; - result += "@GRAD"; + result.reserve(t_name.size() + 7U); + result += t_name; + result += kTensorVectorSuffix; return result; } ////////////////////// Kernel Function (PD_KERNEL) //////////////////////// // Record Op kernel core function -using KernelFunc = std::vector (*)(std::vector inputs, - std::vector attrs); - -#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ - template \ - struct ComputeCallHelper { \ - template \ - static Return Compute(std::vector inputs, \ - std::vector attrs, \ - const PreviousArgs&... pargs) { \ - try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ - return ComputeCallHelper::template Compute( \ - inputs, attrs, pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ +using KernelFunc = + std::vector (*)(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs); + +#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ + template \ + struct ComputeCallHelper { \ + template \ + static Return Compute(const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + const PreviousArgs&... 
pargs) { \ + try { \ + attr_type arg = boost::any_cast(attrs[attr_idx]); \ + return ComputeCallHelper::template Compute< \ + in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ + pargs..., arg); \ + } catch (boost::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator. Expected " #attr_type \ + " value."); \ + } \ + } \ } template @@ -98,31 +115,64 @@ struct KernelFuncImpl; template struct KernelFuncImpl { - static Return Compute(std::vector inputs, - std::vector attrs) { - return ComputeCallHelper>::template Compute<0, 0>( - inputs, attrs); + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs) { + return ComputeCallHelper>::template Compute<0, 0, 0>( + inputs, vec_inputs, attrs); } private: template struct ComputeCallHelper; - // for Tensor input template struct ComputeCallHelper { - template - static Return Compute(std::vector inputs, - std::vector attrs, + template + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, const PreviousArgs&... pargs) { - static_assert(attr_idx == 0, - "Input tensor should appear before attributes."); const Tensor& arg = inputs[in_idx]; - return ComputeCallHelper::template Compute( - inputs, attrs, pargs..., arg); + return ComputeCallHelper::template Compute( + inputs, vec_inputs, attrs, pargs..., arg); } }; + template + struct ComputeCallHelper&, Tail...> { + template + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + const PreviousArgs&... pargs) { + const std::vector& arg = vec_inputs[vec_in_idx]; + return ComputeCallHelper::template Compute< + in_idx, vec_in_idx + 1, attr_idx>(inputs, vec_inputs, attrs, pargs..., + arg); + } + }; + + PD_SPECIALIZE_ComputeCallHelper(const bool&); + PD_SPECIALIZE_ComputeCallHelper(const int&); + PD_SPECIALIZE_ComputeCallHelper(const float&); + PD_SPECIALIZE_ComputeCallHelper(const int64_t&); + PD_SPECIALIZE_ComputeCallHelper(const std::string&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + // TODO(chenweihang): support other attribute type if needed. + // Why not support other attribute type here? + // - boost::blank, std::vector and std::vector + // are not used in op + // - BlockDesc* and std::vector are used in framework + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future PD_SPECIALIZE_ComputeCallHelper(bool); PD_SPECIALIZE_ComputeCallHelper(int); PD_SPECIALIZE_ComputeCallHelper(float); @@ -132,17 +182,15 @@ struct KernelFuncImpl { PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); - // TODO(chenweihang): support other attribute type if needed. - // Why not support other attribute type here? - // - boost::blank, std::vector and std::vector - // are not used in op - // - BlockDesc* and std::vector are used in framework + // end: base template template struct ComputeCallHelper> { - template - static Return Compute(std::vector inputs, - std::vector attrs, const Args&... args) { + template + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + const Args&... 
args) { return impl_fn(args...); } }; @@ -155,40 +203,118 @@ struct KernelFuncImpl { // Record Op infershape core function using InferShapeFunc = std::vector> (*)( - std::vector> input_shapes); + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, + const std::vector& attrs); + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = vec_input_shapes[vec_in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + try { \ + attr_type arg = boost::any_cast(attrs[attr_idx]); \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } catch (boost::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator InferShapeFn. " \ + "Expected " #attr_type \ + " value. InferShapeFn's attribute list must be exactly same as " \ + "Forward " \ + "KernelFn's attribute list except std::vector " \ + "attribute."); \ + } \ + } \ + } template struct InferShapeFuncImpl; template struct InferShapeFuncImpl { - static Return InferShape(std::vector> input_shapes) { - return InferShapeCallHelper>::template InferShape<0>( - input_shapes); + static Return InferShape( + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, + const std::vector& attrs) { + return InferShapeCallHelper>::template InferShape< + 0, 0, 0>(input_shapes, vec_input_shapes, attrs); } private: template struct InferShapeCallHelper; - // only one type input: std::vector - template - struct InferShapeCallHelper, Tail...> { - template - static Return InferShape(std::vector> input_shapes, - const PreviousArgs&... 
pargs) { - std::vector arg = input_shapes[in_idx]; - return InferShapeCallHelper::template InferShape( - input_shapes, pargs..., arg); - } - }; + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( + const std::vector>&); + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(std::vector); + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( + std::vector>); + + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const bool&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const float&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int64_t&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::string&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + // NOTE(chenweihang): InferShape can't support std::vector attr type, + // because the input type is std::vector, only can use one rule to + // parse std::vector parameter // end: base template template struct InferShapeCallHelper> { - template - static Return InferShape(std::vector> input_shapes, - const Args&... args) { + template + static Return InferShape( + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } }; @@ -200,41 +326,73 @@ struct InferShapeFuncImpl { /////////////// InferDataType Function (PD_INFER_DTYPE) /////////////// // Record Op Infer dtype core function -using InferDtypeFunc = - std::vector (*)(std::vector input_dtypes); +using InferDtypeFunc = std::vector (*)( + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes); + +#define PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = input_dtypes[in_idx]; \ + return InferDtypeCallHelper::template InferDtype( \ + input_dtypes, vec_input_dtypes, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = vec_input_dtypes[vec_in_idx]; \ + return InferDtypeCallHelper::template InferDtype< \ + in_idx, vec_in_idx + 1>(input_dtypes, vec_input_dtypes, pargs..., \ + arg); \ + } \ + } template struct InferDtypeFuncImpl; template struct InferDtypeFuncImpl { - static Return InferDtype(std::vector input_dtypes) { - return InferDtypeCallHelper>::template InferDtype<0>( - input_dtypes); + static Return InferDtype( + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes) { + return InferDtypeCallHelper>::template InferDtype<0, + 0>( + input_dtypes, vec_input_dtypes); } private: template struct InferDtypeCallHelper; - // Only one type input now: DataType - template - struct InferDtypeCallHelper { - template - static Return InferDtype(std::vector input_dtypes, - const PreviousArgs&... 
pargs) { - DataType arg = input_dtypes[in_idx]; - return InferDtypeCallHelper::template InferDtype( - input_dtypes, pargs..., arg); - } - }; + PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(const DataType&); + PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(const std::vector&); + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future + PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(DataType); + PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(std::vector); // end: base template template struct InferDtypeCallHelper> { - template - static Return InferDtype(std::vector input_dtypes, - const Args&... args) { + template + static Return InferDtype( + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes, + const Args&... args) { return impl_fn(args...); } }; diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index be492a6d5535d17df579ec4fec8dd76d266a3029..fa91490e6cd8af7c3a4ff4b2b3013b519c59aa2a 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor { /// \brief Construct a Tensor on target Place for CustomOp. /// Generally it's only used for user to create Tensor. explicit Tensor(const PlaceType& place); + /// \brief Construct a Tensor on target Place with shape for CustomOp. + /// Generally it's only used for user to create Tensor. + Tensor(const PlaceType& place, const std::vector& shape); /// \brief Reset the shape of the tensor. /// Generally it's only used for the input tensor. /// Reshape must be called before calling @@ -110,6 +113,9 @@ class PD_DLL_DECL Tensor { /// \brief Cast datatype from one to another Tensor cast(const DataType& target_type) const; + /// \brief Check Tensor is initialized + bool is_initialized() const; + #ifdef PADDLE_WITH_CUDA /// \bref Get current stream of Tensor cudaStream_t stream() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 4434a3bf5941ffdd668c31c737493162e67bea5e..8b2f7cc5bf13c99b80cd365f5c449f3d3b68bdc5 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -13,11 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/extension/include/ext_tensor.h" + #include + #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -97,13 +102,23 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR - tensor->Resize(framework::make_ddim(shape)); + auto new_dim = framework::make_ddim(shape); + tensor->Resize(new_dim); } Tensor::Tensor(const PlaceType &place) : tensor_(std::make_shared()), place_(place), stream_(StreamWrapper()) {} + +Tensor::Tensor(const PlaceType &place, const std::vector &shape) + : tensor_(std::make_shared()), + place_(place), + stream_(StreamWrapper()) { + GET_CASTED_TENSOR + tensor->Resize(framework::make_ddim(shape)); +} + template T *Tensor::mutable_data(const PlaceType &place) { place_ = place; @@ -162,6 +177,12 @@ DataType Tensor::type() const { return DataType::FLOAT64; } else if (type == framework::proto::VarType::BOOL) { return DataType::BOOL; + } else if (type == framework::proto::VarType::COMPLEX64) { + return DataType::COMPLEX64; + } else if (type == framework::proto::VarType::COMPLEX128) { + return DataType::COMPLEX128; + } else if (type == framework::proto::VarType::FP16) { + return DataType::FLOAT16; } // TODO(JiabinYang) Support more dtype here return DataType::FLOAT32; @@ -217,6 +238,12 @@ template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( + const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( + const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL float *Tensor::data() const; template PD_DLL_DECL double *Tensor::data() const; @@ -226,6 +253,12 @@ template PD_DLL_DECL uint8_t *Tensor::data() const; template PD_DLL_DECL int8_t *Tensor::data() const; template PD_DLL_DECL int16_t *Tensor::data() const; template PD_DLL_DECL bool *Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::float16 * +Tensor::data() const; template PD_DLL_DECL float *Tensor::mutable_data(); template PD_DLL_DECL double *Tensor::mutable_data(); @@ -235,6 +268,12 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data(); template PD_DLL_DECL int8_t *Tensor::mutable_data(); template PD_DLL_DECL int16_t *Tensor::mutable_data(); template PD_DLL_DECL bool *Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::float16 * +Tensor::mutable_data(); template PD_DLL_DECL float *Tensor::mutable_data(const PlaceType &place); template PD_DLL_DECL double *Tensor::mutable_data( @@ -250,6 +289,12 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data( template PD_DLL_DECL int16_t *Tensor::mutable_data( const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); +template 
PD_DLL_DECL paddle::platform::complex64 * +Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::float16 * +Tensor::mutable_data(const PlaceType &place); std::vector Tensor::shape() const { GET_CASTED_TENSOR @@ -310,6 +355,21 @@ Tensor Tensor::cast(const DataType &target_type) const { framework::VisitDataType( dst_type, CastDataType(*tensor, rlt_tensor_, ctx)); break; + case framework::proto::VarType::COMPLEX64: + framework::VisitDataType( + dst_type, + CastDataType(*tensor, rlt_tensor_, ctx)); + break; + case framework::proto::VarType::COMPLEX128: + framework::VisitDataType(dst_type, + CastDataType( + *tensor, rlt_tensor_, ctx)); + break; + case framework::proto::VarType::FP16: + framework::VisitDataType( + dst_type, + CastDataType(*tensor, rlt_tensor_, ctx)); + break; // TODO(JiabinYang) Support more dtype here default: PADDLE_THROW(platform::errors::Unimplemented( @@ -324,6 +384,15 @@ int64_t Tensor::size() const { return tensor->numel(); } +bool Tensor::is_initialized() const { + GET_CASTED_TENSOR; + if (tensor->IsInitialized()) { + return true; + } else { + return false; + } +} + #ifdef PADDLE_WITH_CUDA cudaStream_t Tensor::stream() const { if (!stream_.IsStreamSet()) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 43bbc06787e9baf89dd059487531c4ecb6fd6f6a..4644e674ba4853f1ad5e4710c441d6bc73906635 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -100,8 +100,16 @@ if (WITH_GPU) endif() cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) +set(BRPC_DEPS "") +if(WITH_PSLIB OR WITH_PSCORE) + set(BRPC_DEPS brpc) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + endif() +endif() + cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) -cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope) +cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker) cc_library(scope_pool SRCS scope_pool.cc DEPS scope) @@ -191,13 +199,15 @@ if(WITH_PYTHON) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. 
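+  # fleet_proto_init creates python/paddle/distributed/fleet/proto and its empty
+  # __init__.py once; framework_py_proto (and, under WITH_PSCORE, index_dataset_proto_init
+  # below) depend on it, so the per-platform proto-copy commands no longer create
+  # that directory themselves.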
+ add_custom_target(fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py + ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." @@ -207,8 +217,6 @@ if(WITH_PYTHON) string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND copy /Y *.py ${proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
@@ -217,6 +225,12 @@ if(WITH_PYTHON) endif(NOT WIN32) endif() +if (WITH_PSCORE) + add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +endif(WITH_PSCORE) + cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) @@ -237,9 +251,16 @@ if(WITH_DISTRIBUTE) fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto timer monitor - heter_service_proto pslib_brpc) + heter_service_proto ${BRPC_DEP}) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) elseif(WITH_PSCORE) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc @@ -274,7 +295,7 @@ elseif(WITH_PSLIB) pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor pslib_brpc ) + graph_to_program_pass variable_helper timer monitor ${BRPC_DEP}) else() cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc @@ -295,8 +316,14 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS fast_threaded_ssa_graph_executor variable_helper) cc_library(executor_cache SRCS executor_cache.cc DEPS executor) -cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor) +if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor ${RPC_DEPS}) +else() + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor) +endif() cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry @@ -346,97 +373,20 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) + cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info) cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) - set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) -# Old custom op extension mechanism related, will be removed in 2.1.0 -cc_library(paddle_framework_shared - SHARED SRCS executor.cc operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc - DEPS ${FLUID_FRAMEWORK_MODULES}) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework) -target_link_libraries(paddle_framework_shared ${os_dependency_modules}) - -if (LINUX) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so - CACHE INTERNAL "Fluid framework lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(FLUID_FRAMEWORK_IMPORT_LIB - ${paddle_framework_lib_path}/paddle_framework.lib - CACHE INTERNAL "Fluid framework lib") - set(FLUID_FRAMEWORK_SHARED_LIB - ${paddle_framework_lib_path}/paddle_framework.dll - CACHE INTERNAL "Fluid framework dll") -endif() - -if(APPLE) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib - CACHE INTERNAL "Fluid framework lib") -endif() if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() - -# New custom op extension mechanism related - -# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - -set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) -set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) - -cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) - -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) -target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) - -if (LINUX) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_custom_op.so - CACHE INTERNAL "Paddle custom op lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(PADDLE_CUSTOM_OP_IMPORT_LIB - 
${paddle_custom_op_lib_path}/paddle_custom_op.lib - CACHE INTERNAL "Paddle custom op import lib") - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.dll - CACHE INTERNAL "Paddle custom op dll") -endif() - -if(APPLE) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/paddle_custom_op.dylib - CACHE INTERNAL "Paddle custom op lib") -endif() diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc deleted file mode 100644 index 5e73c5cc23afa46506d03d96c893d55592f572de..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/c/c_api.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/c/c_api.h" - -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -extern "C" { - -paddle::framework::OpInfoMap &PD_GetOpInfoMap() { - return paddle::framework::OpInfoMap::Instance(); -} - -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) { - paddle::platform::DeviceContextPool::SetPool(pool); -} - -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type()); - std::vector ret; - if (op_info.grad_op_maker_) { - auto grad_op_descs = - op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block); - size_t op_num = grad_op_descs.size(); - ret.resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - PADDLE_ENFORCE_EQ( - grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - paddle::platform::errors::Unavailable( - "Cannot serialize operator desc message.")); - } - } - return ret; -} - -} // end extern "C" diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h deleted file mode 100644 index a9ec402f381e43b51887b6467d8d1baccf98ad37..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/c/c_api.h +++ /dev/null @@ -1,55 +0,0 @@ -/* copyright (c) 2019 paddlepaddle authors. all rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class OpInfoMap; -} // namespace framework -namespace platform { -class DeviceContextPool; -} // namespace platform -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpInfo map. -paddle::framework::OpInfoMap &PD_GetOpInfoMap(); - -// C-API to init global DeviceContextPool from outside. -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool); - -// C-API to serialize the grad op protocol message to a binary string. -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block); - -#ifdef __cplusplus -} -#endif diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 66e28bb83ce3e42d0b7358bf462cb98e70617fe4..c4b833ec94c2940c6a655c2477845666ee543d7c 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -27,9 +27,7 @@ limitations under the License. */ #include "paddle/fluid/extension/include/ext_tensor.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/c/c_api.h" #include "paddle/fluid/framework/custom_tensor_utils.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -63,6 +61,11 @@ inline bool IsGradVar(const std::string& var_name) { return var_name.rfind(suffix) != std::string::npos; } +inline bool IsDuplicableVar(const std::string& var_name) { + std::string suffix = kTensorVectorSuffix; + return var_name.rfind(suffix) != std::string::npos; +} + inline std::string NoGrad(const std::string& var_name) { std::string suffix = kGradVarSuffix; return var_name.substr(0, var_name.size() - kGradVarSuffixSize); @@ -103,19 +106,47 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, const std::vector& attrs) { VLOG(1) << "Custom Operator: Start run KernelFunc."; std::vector custom_ins; + std::vector> custom_vec_ins; for (auto& in_name : inputs) { VLOG(1) << "Custom Operator: input name - " << in_name; - auto* x = ctx.Input(in_name); - PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( - "Input tensor (%s) is nullptr.", in_name)); - PADDLE_ENFORCE_EQ(x->IsInitialized(), true, - platform::errors::InvalidArgument( - "Input tensor (%s) is not initialized.")); - auto custom_in = paddle::Tensor( - CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); - CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); - CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace()); - custom_ins.emplace_back(custom_in); + if (detail::IsDuplicableVar(in_name)) { + // return const std::vector + auto vec_x = ctx.MultiInput(in_name); + PADDLE_ENFORCE_NE(vec_x.empty(), true, + platform::errors::NotFound( + "Input vector (%s) is empty.", in_name)); + std::vector custom_vec_in; + for (size_t i = 0; i < vec_x.size(); ++i) { + auto* x = vec_x[i]; + PADDLE_ENFORCE_NOT_NULL( + x, platform::errors::NotFound( + "The %d-th tensor in input vector (%s) is nullptr.", + i, in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + 
platform::errors::InvalidArgument( + "The %d-th tensor in input vector (%s) " + "is not initialized.", + i, in_name)); + auto custom_t = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_t); + CustomTensorUtils::SetTensorCurrentStream(&custom_t, ctx.GetPlace()); + custom_vec_in.emplace_back(custom_t); + } + custom_vec_ins.emplace_back(custom_vec_in); + } else { + auto* x = ctx.Input(in_name); + PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( + "Input tensor (%s) is nullptr.", in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + platform::errors::InvalidArgument( + "Input tensor (%s) is not initialized.", in_name)); + auto custom_in = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); + CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace()); + custom_ins.emplace_back(custom_in); + } } std::vector custom_attrs; @@ -146,21 +177,41 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "Unsupported `%s` type value as custom attribute now. " "Supported data types include `bool`, `int`, `float`, " "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector, " + "`std::vector`, `std::vector`, " "`std::vector`, Please check whether " "the attribute data type and data type string are matched.", attr_type_str)); } } - VLOG(1) << "Run ComputeFunc."; + VLOG(1) << "Custom Operator: Run ComputeFunc."; try { - auto outs = func(custom_ins, custom_attrs); + auto outs = func(custom_ins, custom_vec_ins, custom_attrs); VLOG(1) << "Custom Operator: Share outputs into ExecutionContext."; for (size_t i = 0; i < outputs.size(); ++i) { - auto* true_out = ctx.Output(outputs[i]); - CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + auto out_name = outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL, + platform::errors::PreconditionNotMet( + "If custom operator's outputs contains `paddle::Vec(" + ")` type, " + "it only can hold one output.")); + auto vec_true_outs = ctx.MultiOutput(out_name); + PADDLE_ENFORCE_EQ( + vec_true_outs.size(), outs.size(), + platform::errors::InvalidArgument( + "The number of element in custom operator outputs is wrong, " + "expected contains %d Tensors, but actually contains %d " + "Tensors.", + vec_true_outs.size(), outs.size())); + for (size_t j = 0; j < vec_true_outs.size(); ++j) { + CustomTensorUtils::ShareDataTo(outs.at(j), vec_true_outs.at(j)); + } + } else { + auto* true_out = ctx.Output(out_name); + CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + } } } catch (platform::EnforceNotMet& exception) { throw std::move(exception); @@ -195,7 +246,7 @@ class CustomOperator : public OperatorWithKernel { * it can only be determined at runtime. 
*/ framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::VarType::RAW, ctx.GetPlace()); } @@ -206,7 +257,7 @@ class CustomOperator : public OperatorWithKernel { */ framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const OpKernelType& expected_kernel_type) { + const OpKernelType& expected_kernel_type) const override { return OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, tensor.layout()); } @@ -221,10 +272,20 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { void Make() override { for (auto& in_name : inputs_) { - AddInput(in_name, "The input " + in_name + "of Custom operator."); + if (detail::IsDuplicableVar(in_name)) { + AddInput(in_name, "The input " + in_name + "of Custom operator.") + .AsDuplicable(); + } else { + AddInput(in_name, "The input " + in_name + "of Custom operator."); + } } for (auto& out_name : outputs_) { - AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + if (detail::IsDuplicableVar(out_name)) { + AddOutput(out_name, "The output " + out_name + "of Custom Operator.") + .AsDuplicable(); + } else { + AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + } } for (auto& attr : attrs_) { auto attr_name_and_type = detail::ParseAttrStr(attr); @@ -265,7 +326,7 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { "Unsupported `%s` type value as custom attribute now. " "Supported data types include `bool`, `int`, `float`, " "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector, " + "`std::vector`, `std::vector`, " "`std::vector`, Please check whether " "the attribute data type and data type string are matched.", attr_type_str)); @@ -331,7 +392,13 @@ class CustomGradOpMaker : public SingleGradOpMaker { } for (auto& out_name : outputs_) { VLOG(1) << "Custom Operator: GradOpDescMaker - output: " << out_name; - grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + if (detail::IsDuplicableVar(out_name)) { + grad_op->SetOutput(out_name, + this->InputGrad(detail::NoGrad(out_name), + /*drop_empty_grad=*/false)); + } else { + grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + } } grad_op->SetAttrMap(this->Attrs()); } @@ -493,9 +560,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple inputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferShapeFn. At this time, " - "the input shape will be directly set to the output shape.\n" + "and only one output without setting the InferShapeFn. " + "At this time, the input shape will be directly set to " + "the output shape.\n" "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); PADDLE_ENFORCE_EQ( @@ -503,9 +570,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple outputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferShapeFn. At this time, " - "the input shape will be directly set to the output shape.\n" + "and only one output without setting the InferShapeFn. 
" + "At this time, the input shape will be directly set to " + "the output shape.\n" "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); @@ -513,24 +580,91 @@ void RegisterOperatorWithMetaInfo( ctx->ShareDim(op_inputs[0], op_outputs[0]); }; } else { - info.infer_shape_ = [op_inputs, op_outputs, + info.infer_shape_ = [op_inputs, op_outputs, op_attrs, infer_shape_func](InferShapeContext* ctx) { std::vector> input_shapes; + std::vector>> vec_input_shapes; VLOG(1) << "Custom Operator: InferShape - get input ddim."; for (auto& in_name : op_inputs) { - OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); - auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(framework::vectorize(ddim)); + if (detail::IsDuplicableVar(in_name)) { + OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); + auto vec_ddim = ctx->GetInputsDim(in_name); + std::vector> vec_shape; + vec_shape.reserve(vec_ddim.size()); + std::transform(vec_ddim.begin(), vec_ddim.end(), + std::back_inserter(vec_shape), + [&](const DDim& ddim) -> std::vector { + return framework::vectorize(ddim); + }); + vec_input_shapes.emplace_back(vec_shape); + } else { + OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); + auto ddim = ctx->GetInputDim(in_name); + input_shapes.emplace_back(framework::vectorize(ddim)); + } + } + + std::vector custom_attrs; + for (auto& attr_str : op_attrs) { + auto attr_name_and_type = detail::ParseAttrStr(attr_str); + auto attr_name = attr_name_and_type[0]; + auto attr_type_str = attr_name_and_type[1]; + if (attr_type_str == "bool") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "float") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int64_t") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::string") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + // NOTE(chenweihang): InferShape can't support std::vector + // attr type, because the input type is std::vector, only + // can use one rule to parse std::vector parameter + continue; + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. 
" + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector`, " + "`std::vector`, `std::vector`, " + "Please check whether the attribute data type and " + "data type string are matched.", + attr_type_str)); + } } VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = infer_shape_func(input_shapes); + auto output_shapes = + infer_shape_func(input_shapes, vec_input_shapes, custom_attrs); VLOG(1) << "Custom Operator: InferShape - set output ddim."; for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDim(op_outputs[i], - framework::make_ddim(output_shapes[i])); + auto out_name = op_outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + std::vector vec_ddim; + vec_ddim.reserve(output_shapes.size()); + std::transform(output_shapes.begin(), output_shapes.end(), + std::back_inserter(vec_ddim), + [&](const std::vector& shape) -> DDim { + return framework::make_ddim(shape); + }); + ctx->SetOutputsDim(out_name, vec_ddim); + } else { + ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); + } } }; } @@ -544,9 +678,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple inputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferDtypeFn. At this time, " - "the input dtype will be directly set to the output dtype.\n" + "and only one output without setting the InferDtypeFn. " + "At this time, the input dtype will be directly set to " + "the output dtype.\n" "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); PADDLE_ENFORCE_EQ( @@ -554,9 +688,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple outputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferDtypeFn. At this time, " - "the input dtype will be directly set to the output dtype.\n" + "and only one output without setting the InferDtypeFn. 
" + "At this time, the input dtype will be directly set to " + "the output dtype.\n" "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); @@ -568,22 +702,42 @@ void RegisterOperatorWithMetaInfo( info.infer_var_type_ = [op_inputs, op_outputs, infer_dtype_func](InferVarTypeContext* ctx) { std::vector input_dtypes; + std::vector> vec_input_dtypes; VLOG(1) << "Custom Operator: InferDtype - get input dtype."; for (auto& in_name : op_inputs) { - auto dtype = ctx->GetInputDataType(in_name); - input_dtypes.emplace_back( - CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + if (detail::IsDuplicableVar(in_name)) { + std::vector vec_custom_dtype; + for (size_t i = 0; i < ctx->InputSize(in_name); ++i) { + auto dtype = ctx->GetInputDataType(in_name, i); + vec_custom_dtype.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } + vec_input_dtypes.emplace_back(vec_custom_dtype); + } else { + auto dtype = ctx->GetInputDataType(in_name); + input_dtypes.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } } VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; - auto output_dtypes = infer_dtype_func(input_dtypes); + auto output_dtypes = infer_dtype_func(input_dtypes, vec_input_dtypes); VLOG(1) << "Custom Operator: InferDtype - set output dtype."; for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDataType( - op_outputs[i], - CustomTensorUtils::ConvertEnumDTypeToInnerDType(output_dtypes[i])); + auto out_name = op_outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + for (size_t j = 0; j < output_dtypes.size(); ++j) { + auto dtype = CustomTensorUtils::ConvertEnumDTypeToInnerDType( + output_dtypes[i]); + ctx->SetOutputDataType(out_name, dtype, j); + } + } else { + ctx->SetOutputDataType( + out_name, CustomTensorUtils::ConvertEnumDTypeToInnerDType( + output_dtypes[i])); + } } }; } @@ -644,10 +798,39 @@ void RegisterOperatorWithMetaInfo( return new CustomOperator(type, inputs, outputs, attrs); }; - // Grad InferShape (gradient's shape is same with forward input default) - grad_info.infer_shape_ = [grad_op_outputs](InferShapeContext* ctx) { + // Grad InferShape + grad_info.infer_shape_ = [grad_op_inputs, + grad_op_outputs](InferShapeContext* ctx) { + // 1. if forward input exists, gradient's shape is same with forward input + // default + // [Suitable for most situations] + // 2. if forward input not exists, and only contains one grad input and + // output, + // use grad input shape as grad output shape + // [Suitable for the situation that forward input is not used as + // backward input] + // TODO(chenweihang): support set grad op infershape func if needed for (auto& out_name : grad_op_outputs) { - ctx->ShareDim(detail::NoGrad(out_name), out_name); + auto fwd_name = detail::NoGrad(out_name); + if (detail::IsDuplicableVar(fwd_name)) { + // Duplicable forward var must as backward input + ctx->ShareDim(fwd_name, out_name); + } else { + if (ctx->HasInput(fwd_name)) { + ctx->ShareDim(fwd_name, out_name); + } else { + PADDLE_ENFORCE_EQ( + grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, + true, + platform::errors::Unavailable( + "Custom grad operator infershape error. " + "If a custom grad operator contains only one input and " + "only one output, the input shape will be directly set to " + "the output shape. 
Otherwise, Please set the forward input " + "as the grad operator's input.")); + ctx->ShareDim(grad_op_inputs[0], out_name); + } + } } }; diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 117841f80cf47ed95251fee1d01f7fd87caa600b..259901c09f3e00729876d7bea062237ad5bad94a 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -28,5 +28,8 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map); +// Interface for selective register custom op. +void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 2e42248f64bec7d3d30271a61688e8530bf34b83..a65dcbd55f94630612ce59b4d07b0789aaf7c697 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -109,6 +109,12 @@ void GroupTestCopy() { TestCopyTensor(); VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); + VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); } void GroupTestCast() { @@ -126,6 +132,12 @@ void GroupTestCast() { TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float cast"; TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex64 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex128 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "float16 cast"; + TestCast(paddle::DataType::FLOAT16); } void GroupTestDtype() { @@ -136,6 +148,9 @@ void GroupTestDtype() { CHECK(TestDtype() == paddle::DataType::INT16); CHECK(TestDtype() == paddle::DataType::INT8); CHECK(TestDtype() == paddle::DataType::UINT8); + CHECK(TestDtype() == paddle::DataType::COMPLEX64); + CHECK(TestDtype() == paddle::DataType::COMPLEX128); + CHECK(TestDtype() == paddle::DataType::FLOAT16); } void GroupTestDtypeConvert() { @@ -162,6 +177,15 @@ void GroupTestDtypeConvert() { paddle::framework::proto::VarType::INT16); CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX64) == + paddle::framework::proto::VarType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX128) == + paddle::framework::proto::VarType::COMPLEX128); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::FLOAT16) == + paddle::framework::proto::VarType::FP16); // proto -> enum CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::FP64) == @@ -185,6 +209,30 @@ void GroupTestDtypeConvert() { paddle::DataType::INT16); CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX64) == + paddle::DataType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX128) == + paddle::DataType::COMPLEX128); + 
CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::FP16) == + paddle::DataType::FLOAT16); +} + +void TestInitilized() { + paddle::Tensor test_tensor(paddle::PlaceType::kCPU); + CHECK(test_tensor.is_initialized() == false); + test_tensor.reshape({1, 1}); + test_tensor.mutable_data(); + CHECK(test_tensor.is_initialized() == true); + float* tensor_data = test_tensor.data(); + for (int i = 0; i < test_tensor.size(); i++) { + tensor_data[i] = 0.5; + } + for (int i = 0; i < test_tensor.size(); i++) { + CHECK(tensor_data[i] == 0.5); + } } TEST(CustomTensor, copyTest) { @@ -200,4 +248,6 @@ TEST(CustomTensor, copyTest) { GroupTestCast(); VLOG(2) << "TestDtypeConvert"; GroupTestDtypeConvert(); + VLOG(2) << "TestInitilized"; + TestInitilized(); } diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index 919a3a1a49c73b9a3e06265485ef08c7108a8082..809a6b965aad9bcb4594ecff99e460db723dfd53 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -37,7 +37,7 @@ class CustomTensorUtils { /// \brief Share data FROM another tensor. /// Use this to pass tensor from op to op /// \return void. - static void ShareDataFrom(const void* src, const Tensor& dst); + static void ShareDataFrom(const void* src, const paddle::Tensor& dst); static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( const paddle::DataType& dtype) { @@ -56,6 +56,12 @@ class CustomTensorUtils { return framework::proto::VarType::INT64; case paddle::DataType::INT16: return framework::proto::VarType::INT16; + case paddle::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case paddle::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; + case paddle::DataType::FLOAT16: + return framework::proto::VarType::FP16; case paddle::DataType::BOOL: return framework::proto::VarType::BOOL; default: @@ -83,6 +89,12 @@ class CustomTensorUtils { return paddle::DataType::UINT8; case framework::proto::VarType::INT16: return paddle::DataType::INT16; + case framework::proto::VarType::COMPLEX64: + return paddle::DataType::COMPLEX64; + case framework::proto::VarType::COMPLEX128: + return paddle::DataType::COMPLEX128; + case framework::proto::VarType::FP16: + return paddle::DataType::FLOAT16; case framework::proto::VarType::BOOL: return paddle::DataType::BOOL; default: diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 7aa7b7b2d96cf93dd2c0e3ba34b14307c230fa81..c8f73a5469ab32a5734d980010a52a6f72eb6ca8 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 34c87b8388975aa108bbb2ecdaded1ff4a33d16c..5636e3ed1b63f9e4b9854ebcaac4a84a4c20176b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); -#else - LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and " - "only effective when running with CUDA GPU."; #endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); @@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (FLAGS_use_mkldnn) { AppendPass(pass_name); } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; + VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; } #else PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true, @@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { pass->Set("use_gpu", new bool((use_device == p::kCUDA))); if (use_device != p::kCUDA) { - LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + VLOG(1) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "mkldnn_placement_pass") { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 103dd0c5ae599b8126ef63fb8ae456846a2f1966..0fdb97db20af992998d94e37263f415a84cd1ba1 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,8 +354,36 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #endif return; - } + } else if 
(platform::is_npu_place(tensor->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + if (tensor->type() != proto::VarType::FP32) { + return; + } + + framework::LoDTensor cpu_tensor; + cpu_tensor.Resize(tensor->dims()); + float* cpu_data = static_cast( + cpu_tensor.mutable_data(platform::CPUPlace(), tensor->type())); + framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + bool flag = false; + for (int i = 0; i < cpu_tensor.numel(); i++) { + if (isnan(cpu_data[i]) || isinf(cpu_data[i])) { + flag = true; + break; + } + } + PADDLE_ENFORCE_NE( + flag, true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf or Nan.", + op_type, var_name)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Tensor[%s] uses npu place. PaddlePaddle must compile with NPU.", + var_name)); +#endif + return; + } tensor_check(op_type, var_name, *tensor, place); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 3038719539251cd6e5aec5692aab2b695b4212cd..84369011476c77765dc5396830adc34f775fbb50 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -28,7 +28,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_feed.h" -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" @@ -204,7 +205,7 @@ class DeviceWorker { Scope* root_scope_ = nullptr; Scope* thread_scope_; paddle::platform::Place place_; - int64_t batch_num_; + int64_t batch_num_ = 0; FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; @@ -265,6 +266,9 @@ class HogwildWorker : public CPUWorkerBase { HogwildWorkerParameter param_; std::vector skip_ops_; std::map stat_var_name_map_; +#ifdef PADDLE_WITH_HETERPS + platform::DeviceContext* dev_ctx_ = nullptr; +#endif }; class DownpourWorker : public HogwildWorker { @@ -454,7 +458,7 @@ class HeterBoxWorker : public HogwildWorker { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual void ProduceTasks() override; + void ProduceTasks() override; virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } virtual void TrainFilesWithProfiler() {} @@ -555,13 +559,12 @@ class PSGPUWorker : public HogwildWorker { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual void ProduceTasks() override; + void ProduceTasks() override; virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } void ResetStat(); protected: - std::shared_ptr fleet_ptr_; void PushGradients(); void DumpParam(); void CopySparseTable(); @@ -638,7 +641,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} @@ -659,6 +663,9 @@ class SectionWorker : public DeviceWorker { void SetDeviceIndex(int tid) override {} void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } void SetMicrobatchNum(int num) { num_microbatches_ = num; } + void 
SetPipelineStageNum(int num) { num_pipeline_stages_ = num; } + void SetPipelineStage(int stage) { pipeline_stage_ = stage; } + void SetScheduleMode(int mode) { schedule_mode_ = mode; } void SetMicrobatchScopes(const std::vector& scope) { microbatch_scopes_ = scope; } @@ -666,11 +673,23 @@ class SectionWorker : public DeviceWorker { void SetSkipVars(const std::vector& skip_vars) { skip_vars_ = skip_vars; } + void RunBackward( + int micro_id, std::unique_ptr&, + std::unordered_map>&); + void RunForward( + int micro_id, std::unique_ptr&, + std::unordered_map>&); + void RunUpdate( + std::unique_ptr&, + std::unordered_map>&); protected: int section_id_; int thread_id_; int num_microbatches_; + int num_pipeline_stages_; + int pipeline_stage_; + int schedule_mode_; // 0 for F-then-B and 1 for 1F1B std::vector microbatch_scopes_; std::vector skip_vars_; const Scope* minibatch_scope_; diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b52eea852bc39b0081ea92ccfffc1..fb2323d96e2916f0b5c5a62fdb7c27341924bbe0 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 300f0eb0dbb50fa1437ac05bd01b8d27f3aaef34..d102fcdbe0cec14144d54f82e62760e8a1ceaec2 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,9 +29,24 @@ message RecomputeConfig { } message ShardingConfig { - optional float fuse_broadcast_MB = 1 [ default = 32.0 ]; - optional bool hybrid_dp = 2 [ default = false ]; - optional int32 sharding_group_size = 3 [ default = 8 ]; + optional string sharding_segment_strategy = 1 + [ default = 'segment_broadcast_MB' ]; + optional float segment_broadcast_MB = 2 [ default = 32.0 ]; + repeated string segment_anchors = 3; + optional int32 sharding_degree = 4 [ default = 8 ]; + optional int32 mp_degree = 5 [ default = 1 ]; + optional int32 dp_degree = 6 [ default = 1 ]; + optional bool hybrid_dp = 7 [ default = false ]; + optional int32 gradient_merge_acc_step = 8 [ default = 1 ]; + optional bool optimize_offload = 9 [ default = false ]; + optional bool pp_allreduce_in_optimize = 10 [ default = false ]; + optional int32 pp_degree = 11 [ default = 1 ]; +} + +message HybridConfig { + optional int32 dp_degree = 1 [ default = -1 ]; + optional int32 mp_degree = 2 [ default = 1 ]; + optional int32 pp_degree = 3 [ default = 1 ]; } message AMPConfig { @@ -115,11 +130,17 @@ message AsyncConfig { optional bool launch_barrier = 9 [ default = true ]; optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; optional int32 lr_decay_steps = 11 [ default = 10 ]; + optional int32 use_ps_gpu = 12 [ default = 0 ]; } message PipelineConfig { optional int32 micro_batch_size = 1 [ default = 1 ]; optional int32 accumulate_steps = 2 [ default = 1 ]; + optional string schedule_mode = 3 [ default = '1F1B' ]; +} + +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; } message DistributedStrategy { @@ -151,6 +172,9 @@ message 
DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; + optional bool find_unused_parameters = 28 [ default = false ]; + optional bool tensor_parallel = 29 [ default = false ]; + optional bool without_graph_optimization = 30 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -163,6 +187,8 @@ message DistributedStrategy { optional LambConfig lamb_configs = 109; optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; + optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index a3fbb008fe4f444b7ad5b1fb3eb695ca4b4c7796..b99ab6b5a7ff195ef7d659598df88467bb158c6e 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -82,6 +82,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::XPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::NPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0acc8a55fa9f8a79c67b7beb732996c86f86ec5a..de007c128d7543c1433426e80abcbd80ee47dee8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } @@ -169,6 +169,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); @@ -294,6 +297,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -453,6 +459,25 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set 
FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } +#else + PADDLE_THROW( + platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle")); #endif } } @@ -557,7 +582,6 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } } } - platform::AttachPointerHashToMKLDNNKey(this, place_); #else LOG(WARNING) << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7593b60abfffcd9a0a3e9f743930660327c1409e..9c9f29520de439ee209ced19f448bde9905b231b 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -20,14 +20,12 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 782018d1cfe109c3a0cb4919969665207dcfbc9e..3beeacb1010d2687ac0dfd58092773f52c4fafdc 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -22,8 +22,10 @@ #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index c8bc735790400bbc1552c294602275bbf9ab90d4..c06a3d4a183799c7c8ca130f9ff48e7bff23a3bd 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -18,7 +18,6 @@ #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1facc6afb2b9b45316b1205cf676904..a9e4691dd0a01544e1d75d3d27dce43585081837 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,20 +1,27 @@ if(WITH_PSLIB) - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + else() + set(BRPC_DEPS brpc) + endif(WITH_PSLIB_BRPC) + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope ${BRPC_DEPS} pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) 
add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) add_subdirectory(heter_ps) - else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) endif(WITH_NCCL) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) -endif(WITH_PSLIB) +endif(WITH_HETERPS) if(WITH_NCCL OR WITH_RCCL) cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) @@ -37,10 +44,20 @@ else() cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_GLOO) -cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto) +if(WITH_PSLIB) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +endif() +set_source_files_properties(heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +endif() + +cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto +device_context heter_service_proto ${BRPC_DEPS}) cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) - cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph) -endif(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) + cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f700363cf319344ab35b10af545c0373a..273939f6bee613f44353939435a71e571074d4f2 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index da79fccb8ca69fac0f34f8092f296b9923e5f849..f749ee8cfa0baa410e3b6c4b67de07bdd30611ab 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" @@ -37,25 +36,50 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -// typedef std::vector AscendGraphDesc; typedef ge::Graph AscendGraphDesc; +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = ge::AscendString; +#else +using AscendString = std::string; +#endif + class AscendInstance { public: virtual ~AscendInstance() {} AscendInstance() {} - std::map GetDefaultInitSessionOptions() { - std::map init_options; - init_options["a"] = "b"; - init_options["ge.trainFlag"] = "1"; + std::map _GetDefaultInitOptions() { + std::map init_options; + init_options["ge.exec.deviceId"] = "0"; + init_options["ge.graphRunMode"] = "1"; + return init_options; + } + + std::map _GetDefaultInitSessionOptions() { + std::map init_options; + // init_options["a"] = "b"; + // init_options["ge.trainFlag"] = "1"; return init_options; } - // add other parameters here to init + ge::Status InitGEForUT() { + return ge::GEInitialize(_GetDefaultInitOptions()); + } + void InitGlobalResouces() { - session_.reset(new ge::Session(GetDefaultInitSessionOptions())); - VLOG(1) << "InitGlobalResouces Done"; + LOG(INFO) << "Begin ascend InitGlobalResouces"; + session_.reset(new ge::Session(_GetDefaultInitSessionOptions())); + if (session_ == nullptr) { + PADDLE_THROW(platform::errors::Fatal("new session error: nullptr")); + } + LOG(INFO) << "End ascend InitGlobalResouces"; + } + + void DestroyGlobalResouces() { + LOG(INFO) << "Begin ascend DestroyGlobalResouces"; + session_ = nullptr; + LOG(INFO) << "End ascend DestroyGlobalResouces"; + } static std::shared_ptr GetInstance() { @@ -178,6 +202,6 @@ class AscendInstance { private: static std::shared_ptr ascend_instance_; }; -} // end namespace framework -} // end namespace paddle +} // namespace framework +} // namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index e584fb5e2b9ca77923161c8c89c2e7784c5d164b..09f7801b19f988bb7c0948b127b79e6d848629be 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -28,12 +28,15 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/platform/type_defs.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index a02931b3f5c28a6e8e09866f3352109b7fe91adb..1fb2f0fab4aff9926f6ed6c30fa72cb9e9c93cf6 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -14,15 +14,21 @@ limitations under the License. 
*/ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif + +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#endif + #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/scope.h" @@ -39,7 +45,12 @@ class HeterContext { } Scope* scope_{nullptr}; std::vector> feature_keys_; +#ifdef PADDLE_WITH_PSLIB std::vector> value_ptr_; +#endif +#ifdef PADDLE_WITH_PSCORE + std::vector> value_ptr_; +#endif std::vector> device_values_; std::vector> device_keys_; std::vector mutex_; @@ -66,6 +77,21 @@ class HeterContext { mutex_[i] = new std::mutex(); } } + + void Reset() { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } void batch_add_keys( const std::vector>& thread_keys) { assert(thread_keys.size() == feature_keys_.size()); @@ -79,6 +105,15 @@ class HeterContext { } } + void batch_add_keys(int shard_num, + const std::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 6df2cd52bb401d3cc378c2776073471070f1e411..67c44368b7aea471a4e73eaf6132c85883068d58 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -1,5 +1,13 @@ IF(WITH_GPU) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) + SET(HETERPS_DEPS device_context) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(HETERPS_DEPS ${HETERPS_DEPS} cub) + endif() + if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) + endif() + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index 698ece09de6c50781662659e317f4b1fc8f340b1..c3bf33b32c2daf298ddc9af546c4c047bf6e9a6e 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index e5c0972763bede000961e970390c64431ac3cb22..3782e14ad41a5ed6ce5ef1eb0788842d03ecddc7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -17,11 +17,17 @@ limitations under the License. */ #include #include #include +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT +#endif +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" +#endif #include "thrust/pair.h" //#include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/platform/type_defs.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 871f9c7857af46d8aad7cfbfafcdc80f0f52f259..098c795fc7e1f968491a05a241aa397b53eea7f9 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -119,6 +119,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { continue; } ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -138,6 +139,14 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } +#endif } container_->prefetch(devid, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 0e38ebbd7f4e7280d5571ffd216143b277f063d6..2ec2a8a1f1e223c85ffbf26ecbaccc434ca055f7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "thrust/pair.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -182,7 +182,7 @@ class HeterComm { std::vector> path_; std::vector storage_; int feanum_{1800 * 2048}; - int multi_node_{1}; + int multi_node_{0}; std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2f1c809c01eaadcad8c3406e882acafaadb09134..1b4205e3c38fe27419c4ba42e6950b581db62a99 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS #include -#ifdef PADDLE_WITH_PSLIB namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index f2e129ded9fefc58e40e477142cd56de2c0a3448..581b0d511c23ee070b6dc33af315cc420f6ef20a 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -54,8 +54,8 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { - // comm_->push_sparse(num, d_keys, d_grads, len, opt_); - comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 142f4a93b93a29410f02ef32fb7d7bd08fb6654f..d78b6b492074deb9f3dcc1073e951353c0846abb 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 7980220eab9b9b6f36d6449c5f553daeee2e36f3..05b3ecf9c3c12c6b4df1192785b0659a8ef851d0 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index f65b664f83ba0dd3a383d9443d67679cef3a509c..0f2af2a522e287dd9361ba8448441a41b7293794 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include "heter_resource.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index ad7649a8a33cb77e0581429d1da202b2d218b0dc..7b23379994c735eb1f8ad9d5c5d787388d08ce8d 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index b3ec9e752e62bb01a73f2d2070f94a47f8fe0730..7e82a8e014fd3cb33b706c9fc5c1e671392e05a7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 871d2e251b41016d548fa1e257560aca9db030d7..4e529de077593777c1ab326db395febaefb9564a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -25,6 +25,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_PSLIB #include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 4274876c9975e5b16824af4c799bf81228659d92..67ff6b6acaefb26adc1389559a763b98f41a533a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -26,8 +26,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -58,7 +57,12 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto& device_mutex = gpu_task->mutex_; std::vector threads; +#ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -99,12 +103,26 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, timeline.Start(); + threads.clear(); // merge thread_keys to shard_keys - for (size_t i = 0; i < thread_keys_.size(); i++) { - gpu_task->batch_add_keys(thread_keys_[i]); - for (int j = 0; j < thread_keys_thread_num_; j++) { - thread_keys_[i][j].clear(); + auto merge_ins_func = [this, gpu_task](int shard_num) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } + }; + + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} + for (int i = 0; i < thread_keys_shard_num_; ++i) { + threads.push_back(std::thread(merge_ins_func, i)); + } + for (auto& t : threads) { + t.join(); } timeline.Pause(); @@ -124,9 +142,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto ptl_func = [this, &local_keys, &local_ptr, &table_id, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); +#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( reinterpret_cast(local_ptr[i].data()), table_id, local_keys[i].data(), key_size); +#endif +#ifdef PADDLE_WITH_PSCORE + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), table_id, + local_keys[i].data(), key_size); +#endif tt.wait(); auto status = tt.get(); // auto status = 0; @@ -153,8 +178,14 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs(device_num); +#endif for (size_t j = 0; j < local_keys[i].size(); j++) { int shard = local_keys[i][j] % device_num; @@ -169,7 +200,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, int cur = device_keys[dev].size(); device_keys[dev].resize(device_keys[dev].size() + len); device_vals[dev].resize(device_vals[dev].size() + len); - +#ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; float* ptr_val = task_ptrs[dev][j]->data(); @@ -196,6 +227,35 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + distributed::VALUE* ptr_val = task_ptrs[dev][j]; + FeatureValue& val = device_vals[dev][cur + j]; + bool has_mf = 1; + val.delta_score = 0; + val.show = ptr_val->count_; + val.clk = 0; + val.slot = 0; + val.lr = 0; + val.lr_g2sum = 0; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (has_mf) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val->data_[x]; + } + } else { + val.mf_size = 0; + 
for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif + VLOG(1) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } @@ -215,6 +275,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { int device_num = heter_devices_.size(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -227,8 +288,8 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { size_max = std::max(size_max, feature_keys_count[i]); } if (HeterPs_) { - HeterPs_->show_one_table(0); - return; + delete HeterPs_; + HeterPs_ = nullptr; } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); @@ -249,6 +310,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + gpu_task_pool_.Push(gpu_task); } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2eedcd5f1c70052533243540e0ca432dea24725d..2bf564d3f76d5a7039bf8ff03a00b66abfb84171 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include #include #include diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ef586b41fe05d2f21e1469dcd7bcce3d77fc9651..cfb23d1be2acfed0a878cb3bffa241afa2cf3de8 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -26,7 +25,6 @@ limitations under the License. */ #include #include #include - #ifdef PADDLE_WITH_GLOO #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -42,6 +40,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/service/communicator.h" +#endif namespace paddle { namespace framework { @@ -219,7 +220,7 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; - int multi_node_{1}; + int multi_node_{0}; int node_size_; std::vector inner_comms_; std::vector inter_comms_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c8b6c7642551756273ebecb83093e1d75d131f2c..9ab6b5d8c178b9272241ce8db600f3ea32276988 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -86,8 +86,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + callback_manager_.reset( + new platform::StreamCallbackManager(stream_)); #endif - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { @@ -121,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_ASCEND_CL +NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUDefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void NPUDefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} +NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUUnsafeFastGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? 
-1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 97800865af861f6598a3e74456deef1d0c355786..2c2b57bbe420a84ed6a9c7250d4246e677bcf88a 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector { private: gpuStream_t stream_; - std::unique_ptr callback_manager_; + std::unique_ptr> + callback_manager_; }; class CUDAPinnedGarbageCollector : public GarbageCollector { @@ -130,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDefaultStreamGarbageCollector : public GarbageCollector { + public: + NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class NPUUnsafeFastGarbageCollector : public GarbageCollector { + public: + NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; +#endif + template void GarbageCollector::Add(Container &&objs) { Add(std::forward(objs), []() {}); diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 8f52235c962445b2d493f79d178a8e404afdf343..7e5bf138d9fa9270eef7b19e0b350301a2290ab7 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -30,10 +30,12 @@ limitations under the License. */ #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/platform/timer.h" +#endif namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB typedef std::function HeterServiceHandler; class DataFeed; @@ -70,308 +72,7 @@ class HeterXpuService : public HeterService { std::unordered_map handler_map_; }; -enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; - -class HeterTask { - public: - void Update() { - if (state_ == PULL_SPARSE) { - state_ = OP_RUN; - } else if (state_ == OP_RUN) { - state_ = XPU; - // state_ = PUSH_GRAD; - // state_ = PUSH_GRAD; - } else if (state_ == XPU) { - state_ = OP_RUN_END; - } else if (state_ == OP_RUN_END) { - state_ = PUSH_GRAD; - } else if (state_ == PUSH_GRAD) { - state_ = DONE; - } - } - void Reset() { - total_time = 0; - read_time = 0; - pack_time = 0; - pull_sparse_local_time = 0; - op_all_time = 0; - xpu_op_time = 0; - xpu_wait_time = 0; - cpu_op_time = 0; - collect_label_time = 0; - fill_sparse_time = 0; - push_sparse_time = 0; - gpu_2_cpu_time = 0; - cpu_2_gpu_time = 0; - timeline.Reset(); - } - void Show() { - std::cout << "features size " << features_.size() << std::endl; - for (size_t i = 0; i < features_.size(); ++i) { - std::cout << "features[" << i << "] size " << features_[i].size() - << std::endl; - } - } - void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, - const ProgramDesc& program); - void PackGpuTask(Scope* thread_scope, DataFeed* reader, - const ProgramDesc& program); - - Scope* scope_{nullptr}; - int taskid_; - int cur_batch_; - HeterTaskState state_; - // cache - std::map> features_; - std::map> feature_labels_; - std::map>> feature_values_; - std::map>> feature_grads_; - std::map> sparse_push_keys_; - double total_time{0}; - double read_time{0}; - double pack_time{0}; - double pull_sparse_local_time{0}; - double op_all_time{0}; - double xpu_op_time{0}; - 
double xpu_wait_time{0}; - double cpu_op_time{0}; - double collect_label_time{0}; - double fill_sparse_time{0}; - double push_sparse_time{0}; - double gpu_2_cpu_time{0}; - double cpu_2_gpu_time{0}; - platform::Timer timeline; -}; - -template -class HeterObjectPool { - public: - HeterObjectPool() {} - virtual ~HeterObjectPool(){}; - std::shared_ptr Get() { - std::lock_guard lock(mutex_); - if (pool_.empty()) { - num_ += 1; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(0) << "pool construct size: " << num_; #endif - return std::make_shared(); - } else { - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - } - void Push(std::shared_ptr data) { - std::lock_guard lock(mutex_); - pool_.push_back(std::move(data)); - } - int Size() { - std::lock_guard lock(mutex_); - return pool_.size(); - } - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - std::mutex mutex_; - int num_{0}; -}; - -struct BthreadMutextGuard { - BthreadMutextGuard(bthread_mutex_t* rho) { - mutex_ = rho; - bthread_mutex_lock(mutex_); - } - ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } - bthread_mutex_t* mutex_; -}; - -template -class BtObjectPool { - public: - BtObjectPool() { - bthread_mutex_init(&mutex_, NULL); - bthread_cond_init(&cond_, NULL); - } - - virtual ~BtObjectPool() { - bthread_cond_destroy(&cond_); - bthread_mutex_destroy(&mutex_); - }; - - std::shared_ptr Get() { - BthreadMutextGuard guard(&mutex_); - while (pool_.empty()) { - bthread_cond_wait(&cond_, &mutex_); - } - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - - void Push(std::shared_ptr data) { - BthreadMutextGuard guard(&mutex_); - pool_.push_back(std::move(data)); - bthread_cond_signal(&cond_); - } - - int Size() { return pool_.size(); } - - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - bthread_mutex_t mutex_; - bthread_cond_t cond_; - int num_{0}; -}; - -template -struct HeterNode { - K key; - T value; - HeterNode* prev; - HeterNode* next; -}; - -template -class HeterList { - public: - HeterList() : head_(new HeterNode), tail_(new HeterNode) { - head_->prev = NULL; - head_->next = tail_; - tail_->prev = head_; - tail_->next = NULL; - size = 0; - cap_ = 1e9; - } - - ~HeterList() { - delete head_; - delete tail_; - } - - void SetCap(int num) { cap_ = num; } - - bool TryPut(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - if (task_map_.find(key) != task_map_.end()) { - // std::cout << "try put key=" << key << " false" << std::endl; - task_map_.erase(key); - return false; - } else { - HeterNode* node = new HeterNode; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - // std::cout << "try put key=" << key << " true" << std::endl; - return true; - } - } - - bool Put(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - HeterNode* node = new HeterNode; - // std::cout << "put key=" << key << " true" << std::endl; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - return true; - } - - T TryGet(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - // std::cout << "try get key=" << key << " true" << std::endl; - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - task_map_.insert(key); 
- // std::cout << "try get key=" << key << " false" << std::endl; - return nullptr; - } - - T Get(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - // std::cout << "get key=" << key << " true" << std::endl; - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - // std::cout << "get key=" << key << " false" << std::endl; - return nullptr; - } - - T Get() { - std::lock_guard lock(mutex_); - HeterNode* node = head_->next; - if (node == tail_) { - // std::cout << "get2 false" << std::endl; - return nullptr; - } else { - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(node->key); - // std::cout << "get2 key=" << node->key << " true" << std::endl; - delete node; - return ret; - } - } - - bool Empty() { - std::lock_guard lock(mutex_); - return head_->next == tail_; - } - - int Size() { - std::lock_guard lock(mutex_); - return size; - } - - private: - void detach(HeterNode* node) { - node->prev->next = node->next; - node->next->prev = node->prev; - size--; - } - - void attach(HeterNode* node) { - node->prev = head_; - node->next = head_->next; - head_->next->prev = node; - head_->next = node; - size++; - } - - private: - HeterNode* head_; - HeterNode* tail_; - std::unordered_map*> map_; - std::unordered_set task_map_; - std::mutex mutex_; - std::condition_variable cond_; - int cap_; - int size; -}; } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/heter_util.h b/paddle/fluid/framework/heter_util.h new file mode 100644 index 0000000000000000000000000000000000000000..a08f08428da346b4338ff4d7b8cc16e25118f909 --- /dev/null +++ b/paddle/fluid/framework/heter_util.h @@ -0,0 +1,329 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#ifdef PADDLE_WITH_PSLIB +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include "bthread/bthread.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { +class DataFeed; +enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; + +class HeterTask { + public: + HeterTask() {} + virtual ~HeterTask(){}; + + void Update() { + if (state_ == PULL_SPARSE) { + state_ = OP_RUN; + } else if (state_ == OP_RUN) { + state_ = XPU; + // state_ = PUSH_GRAD; + // state_ = PUSH_GRAD; + } else if (state_ == XPU) { + state_ = OP_RUN_END; + } else if (state_ == OP_RUN_END) { + state_ = PUSH_GRAD; + } else if (state_ == PUSH_GRAD) { + state_ = DONE; + } + } + void Reset() { + total_time = 0; + read_time = 0; + pack_time = 0; + pull_sparse_local_time = 0; + op_all_time = 0; + xpu_op_time = 0; + xpu_wait_time = 0; + cpu_op_time = 0; + collect_label_time = 0; + fill_sparse_time = 0; + push_sparse_time = 0; + gpu_2_cpu_time = 0; + cpu_2_gpu_time = 0; + timeline.Reset(); + } + void Show() { + std::cout << "features size " << features_.size() << std::endl; + for (size_t i = 0; i < features_.size(); ++i) { + std::cout << "features[" << i << "] size " << features_[i].size() + << std::endl; + } + } + void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, + const ProgramDesc& program); + void PackGpuTask(Scope* thread_scope, DataFeed* reader, + const ProgramDesc& program); + + Scope* scope_{nullptr}; + int taskid_; + int cur_batch_; + HeterTaskState state_; + // cache + std::map> features_; + std::map> feature_labels_; + std::map>> feature_values_; + std::map>> feature_grads_; + std::map> sparse_push_keys_; + double total_time{0}; + double read_time{0}; + double pack_time{0}; + double pull_sparse_local_time{0}; + double op_all_time{0}; + double xpu_op_time{0}; + double xpu_wait_time{0}; + double cpu_op_time{0}; + double collect_label_time{0}; + double fill_sparse_time{0}; + double push_sparse_time{0}; + double gpu_2_cpu_time{0}; + double cpu_2_gpu_time{0}; + platform::Timer timeline; +}; +#endif +template +class HeterObjectPool { + public: + HeterObjectPool() {} + virtual ~HeterObjectPool(){}; + std::shared_ptr Get() { + std::lock_guard lock(mutex_); + if (pool_.empty()) { + num_ += 1; + return std::make_shared(); + } else { + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + } + void Push(std::shared_ptr data) { + std::lock_guard lock(mutex_); + pool_.push_back(std::move(data)); + } + int Size() { + std::lock_guard lock(mutex_); + return pool_.size(); + } + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + std::mutex mutex_; + int num_{0}; +}; + +#ifdef PADDLE_WITH_PSLIB +struct BthreadMutextGuard { + BthreadMutextGuard(bthread_mutex_t* rho) { + mutex_ = rho; + bthread_mutex_lock(mutex_); + } + ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } + bthread_mutex_t* mutex_; +}; + +template +class BtObjectPool { + public: + BtObjectPool() { + bthread_mutex_init(&mutex_, NULL); + bthread_cond_init(&cond_, NULL); + } + + virtual ~BtObjectPool() { + bthread_cond_destroy(&cond_); + bthread_mutex_destroy(&mutex_); + }; + + std::shared_ptr Get() { + BthreadMutextGuard guard(&mutex_); + while (pool_.empty()) { + bthread_cond_wait(&cond_, &mutex_); + } + auto ret = pool_.back(); + pool_.pop_back(); + return 
ret; + } + + void Push(std::shared_ptr data) { + BthreadMutextGuard guard(&mutex_); + pool_.push_back(std::move(data)); + bthread_cond_signal(&cond_); + } + + int Size() { return pool_.size(); } + + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + bthread_mutex_t mutex_; + bthread_cond_t cond_; + int num_{0}; +}; + +template +struct HeterNode { + K key; + T value; + HeterNode* prev; + HeterNode* next; +}; + +template +class HeterList { + public: + HeterList() : head_(new HeterNode), tail_(new HeterNode) { + head_->prev = NULL; + head_->next = tail_; + tail_->prev = head_; + tail_->next = NULL; + size = 0; + cap_ = 1e9; + } + + ~HeterList() { + delete head_; + delete tail_; + } + + void SetCap(int num) { cap_ = num; } + + bool TryPut(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + if (task_map_.find(key) != task_map_.end()) { + task_map_.erase(key); + return false; + } else { + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + } + + bool Put(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + + T TryGet(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + task_map_.insert(key); + return nullptr; + } + + T Get(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + return nullptr; + } + + T Get() { + std::lock_guard lock(mutex_); + HeterNode* node = head_->next; + if (node == tail_) { + return nullptr; + } else { + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(node->key); + delete node; + return ret; + } + } + + bool Empty() { + std::lock_guard lock(mutex_); + return head_->next == tail_; + } + + int Size() { + std::lock_guard lock(mutex_); + return size; + } + + private: + void detach(HeterNode* node) { + node->prev->next = node->next; + node->next->prev = node->prev; + size--; + } + + void attach(HeterNode* node) { + node->prev = head_; + node->next = head_->next; + head_->next->prev = node; + head_->next = node; + size++; + } + + private: + HeterNode* head_; + HeterNode* tail_; + std::unordered_map*> map_; + std::unordered_set task_map_; + std::mutex mutex_; + std::condition_variable cond_; + int cap_; + int size; +}; +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/heterbox_worker.cc b/paddle/fluid/framework/heterbox_worker.cc index 726b651fcf4ec7409eee7d1893803ef67d87db7f..b7df88218cbd4dd9018e49d709922cca3b287678 100644 --- a/paddle/fluid/framework/heterbox_worker.cc +++ b/paddle/fluid/framework/heterbox_worker.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 5e1fabf2038cc26d4da555b712cbb3199854d686..8049a1c9424bebf271f55c1247f1277a0836d88d 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 5de8d0fcdde91993d69c16cea805a7e4042c9645..845cdbdb2500370d41a63eeb3209b347c414ac10 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -38,6 +39,9 @@ void HogwildWorker::Initialize(const TrainerDesc &desc) { for (int i = 0; i < param_.stat_var_names_size(); ++i) { stat_var_name_map_[param_.stat_var_names(i)] = 1; } +#ifdef PADDLE_WITH_HETERPS + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); +#endif } void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { @@ -149,6 +153,9 @@ void HogwildWorker::TrainFilesWithProfiler() { VLOG(3) << "Going to run op " << op_name[i]; if (!need_skip) { ops_[i]->Run(*thread_scope_, place_); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } VLOG(3) << "Op " << op_name[i] << " Finished"; timeline.Pause(); @@ -166,6 +173,16 @@ void HogwildWorker::TrainFilesWithProfiler() { total_inst += cur_batch; ++batch_cnt; PrintFetchVars(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); + VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time + << " seconds, ins_num: " << total_inst; + for (size_t i = 0; i < op_name.size(); ++i) { + VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] + << ", mean time: " << op_total_time[i] / total_inst + << "s, totol time:" << op_total_time[i] << "sec"; + } +#else if (thread_id_ == 0) { if (batch_cnt > 0 && batch_cnt % 100 == 0) { for (size_t i = 0; i < ops_.size(); ++i) { @@ -177,6 +194,7 @@ void HogwildWorker::TrainFilesWithProfiler() { fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); } } +#endif thread_scope_->DropKids(); timeline.Start(); } @@ -194,8 +212,10 @@ void HogwildWorker::TrainFilesWithProfiler() { void HogwildWorker::TrainFiles() { platform::SetNumThreads(1); - - std::cerr << "1!!!!!" 
<< std::endl; + platform::Timer timeline; + timeline.Start(); + + int total_ins_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; @@ -215,10 +235,13 @@ void HogwildWorker::TrainFiles() { } } + total_ins_num += cur_batch; PrintFetchVars(); thread_scope_->DropKids(); } - std::cerr << "total bacth " << i << std::endl; + timeline.Pause(); + VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + << " seconds, ins_num: " << total_ins_num; #if defined PADDLE_WITH_PSCORE if (thread_barrier_) { paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement(); @@ -230,14 +253,32 @@ void HogwildWorker::PrintFetchVars() { // call count batch_num_++; int batch_per_print = fetch_config_.print_period(); - if (thread_id_ == 0) { - if (batch_num_ % batch_per_print == 0) { - int fetch_var_num = fetch_config_.fetch_var_names_size(); - for (int i = 0; i < fetch_var_num; ++i) { - platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), - fetch_config_.fetch_var_str_format(i)); + int fetch_var_num = fetch_config_.fetch_var_names_size(); + + if (fetch_var_num == 0) { + return; + } + + if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { + time_t curtime; + time(&curtime); + char mbstr[80]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", + std::localtime(&curtime)); + + std::stringstream ss; + ss << "time: [" << mbstr << "], "; + ss << "batch: [" << batch_num_ << "], "; + + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i), &ss); + if (i < fetch_var_num - 1) { + ss << ", "; } } + + std::cout << ss.str() << std::endl; } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0ca78c679aecaa396b59c7d50471baee239ba622..ab69170322ce3ec4eaa8e46b53e490b634df64b7 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -86,6 +86,7 @@ pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass inference) pass_library(delete_quant_dequant_filter_op_pass inference) +pass_library(delete_dropout_op_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..09962239a01b1839bea93846ca3ffe9ded3cca4e --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
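+// DeleteDropoutOpPass links each dropout op's input var directly to the
+// consumer op and erases the dropout op together with its Out and Mask vars,
+// so inference graphs no longer carry dropout nodes.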
+#include + +#include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" + +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(dropout_op); \ + GET_IR_NODE(dropout_op_out); \ + GET_IR_NODE(dropout_op_outmask); \ + GET_IR_NODE(any_op2); + +void DeleteDropoutOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_dropout_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteDropoutOpPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string dropout_op_out_name = dropout_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + auto var_map = any_op2_desc->Inputs(); + std::string arg_name = ""; + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + arg_name = name_m.first; + } + } + if (arg_name.size() == 0) { + LOG(INFO) << "Delete dropout op pass: can not find the input " + << dropout_op_out_name; + return; + } + + // modify the any_op2's inputs + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + std::vector new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != dropout_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + any_op2_desc->Flush(); + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {dropout_op, dropout_op_out, dropout_op_outmask}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_dropout_op_pass, + paddle::framework::ir::DeleteDropoutOpPass); diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.h similarity index 62% rename from paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc rename to paddle/fluid/framework/ir/delete_dropout_op_pass.h index 3f3b6b959e30194c10b1a58d6fc3e7a61ad01313..c49abf3c871ced474bc47e28ec32d29bc9ccf750 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.h @@ -12,16 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { -namespace operators { -namespace distributed { +namespace framework { +namespace ir { + +class Graph; + +class DeleteDropoutOpPass : public FusePassBase { + public: + virtual ~DeleteDropoutOpPass() {} -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; -} // namespace distributed -} // namespace operators +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 84c6b03e76bc1efd9e7d4c34b9b6151b16bf4040..48f79e63b4f0ea51df27695943690c1c36727e93 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -34,15 +34,19 @@ namespace patterns { static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, const std::string& arg, bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = - pattern->NewNode(name)->assert_is_op_input("lookup_table", arg); + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); if (is_persist) return node->assert_is_persistable_var(); return node; } static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = pattern->NewNode(name) - ->assert_is_only_output_of_op("lookup_table") + ->assert_is_only_output_of_ops(embedding_ops) ->assert_is_op_input("elementwise_add", arg) ->AsIntermediate(); return node; @@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); auto* lookup_table2_w = create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table2 = - pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); auto* lookup_table2_out = @@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); auto* lookup_table1_w = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); auto* eltwise_add = @@ -282,15 +290,30 @@ static int BuildFusion(Graph* graph, const std::string& name_scope ids.push_back(inner_pattern_ins[js[iter]].first->Name()); embs.push_back(inner_pattern_ins[js[iter]].second->Name()); } + OpDesc new_op_desc; 
new_op_desc.SetType("fused_embedding_eltwise_layernorm"); new_op_desc.SetInput("Ids", ids); new_op_desc.SetInput("Embs", embs); + + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + new_op_desc.SetInput("SentId", {ids[2]}); + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); new_op_desc.SetAttr("epsilon", end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + } + auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { @@ -347,4 +370,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("lookup_table", 0) + .LE("lookup_table_v2", 1) .EQ("elementweise_add", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index deb182c0fbe19c0ec9cb2e6f4b215b7983be3371..064da3d941602ee0e4f868fb0dbda305102da32b 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input( return this; } +PDNode *PDNode::assert_is_only_input_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_output_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + bool VarLinksToOp(Node *node, const std::string &op_type) { for (auto *out : node->outputs) { if (out->IsOp() && out->Op()->Type() == op_type) { @@ -2409,6 +2439,29 @@ PDNode *patterns::TransposeFlattenConcat::operator()( return concat_out; } +void patterns::DeleteDropoutOpPattern::operator()() { + auto any_op_out = pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input("dropout", "X") + ->AsInput(); + + auto dropout_op = + pattern->NewNode(dropout_op_repr())->assert_is_op("dropout"); + + auto dropout_op_out = pattern->NewNode(dropout_op_out_repr()) + ->assert_is_op_output("dropout", "Out") + ->AsIntermediate(); + + auto dropout_op_outmask = pattern->NewNode(dropout_op_outmask_repr()) + ->assert_is_op_output("dropout", "Mask") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + dropout_op->LinksFrom({any_op_out}); + dropout_op_out->LinksFrom({dropout_op}); + dropout_op_outmask->LinksFrom({dropout_op}); + any_op2->LinksFrom({dropout_op_out}); +} + void patterns::DeleteQuantOpFuse::operator()(PDNode *input_act_node, const std::string &quant_type) { auto *input_scale_node = pattern->NewNode(GetNodeName("input_scale_node")) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h 
b/paddle/fluid/framework/ir/graph_pattern_detector.h index 2e518c1d4df72aae98b21233918e335d4286b3de..13f65859954d58ce446ab3b9de488833f6220dee 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -28,7 +28,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" @@ -146,6 +145,11 @@ struct PDNode { const std::unordered_set& op_types, const std::string& argument, int nth); + PDNode* assert_is_only_input_of_ops( + const std::unordered_set& op_types); + PDNode* assert_is_only_output_of_ops( + const std::unordered_set& op_types); + PDNode* assert_has_n_inputs(size_t n); PDNode* assert_has_n_outputs(size_t n); @@ -1460,6 +1464,19 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteDropoutOpPattern : public PatternBase { + DeleteDropoutOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_dropout_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(dropout_op); + PATTERN_DECL_NODE(dropout_op_out); + PATTERN_DECL_NODE(dropout_op_outmask); + PATTERN_DECL_NODE(any_op2); +}; + struct DeleteQuantDequantOpPattern : public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44d2eac39091c72d500ffb746089d..25bf03f426a1d9d77c17b26ece92943d71a9ed81 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 69edc3d87f97d6762079a37c920f5ece57903cfa..18d2e9817ebec857e1b13d7d6e0e9f2201a69d94 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -14,7 +14,6 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc index 5fd47b21733b54009954843fe02ae81f171f1554..5fe71fbc21451f13991cab4f612d251d028ac792 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/naive_executor.h" diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index 
a2443c86986ec87cc29e9897fe0d38883f8fafa1..c36123f65f6644289cfba2b2729862efa601e2fd 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && x_rank == 2 && y_rank == 2; + flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; std::vector& next_ops = matmul_out->outputs; flag = flag && next_ops.size() == 1 && @@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { desc.SetInput("X", {matmul_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); if (matmul_op->Op()->HasAttr("enable_int8")) { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index 3d7a9c1107bbaac04a3a478014520a9b340b1d5f..531a04e1a0d4c11799e8dea520faed447de4e808 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -53,7 +53,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( gpd(graph, handler); } -void CPUBfloat16PlacementPass::RemoveOrhanedOperators( +void CPUBfloat16PlacementPass::RemoveOrphanedOperators( ir::Graph* graph, int* bfloat16_operators) const { // find orphaned bfloat16 operator that is between two float32 operators // revert mkldnn_data_type attr to float32 @@ -74,7 +74,7 @@ void CPUBfloat16PlacementPass::RemoveOrhanedOperators( void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { int bfloat16_operators = 0; SetMkldnnDataType(graph, &bfloat16_operators); - RemoveOrhanedOperators(graph, &bfloat16_operators); + RemoveOrphanedOperators(graph, &bfloat16_operators); PrettyLogDetail("--- marked %d operators to bfloat16 ", bfloat16_operators); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h index 1911b1a3cb32a6a23585e8240c462aa84e8d869b..53b97f0e9726aacf86f6f71d3382ab25241e3cdb 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -28,7 +28,7 @@ class CPUBfloat16PlacementPass : public Pass { protected: void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; - void RemoveOrhanedOperators(ir::Graph* graph, int* bfloat16_operators) const; + void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const; void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 06df1caca35b922ac96d7d886296a6dee6bfb764..4eb532b47cb4b59cb3df0fe775400caa01354269 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -43,8 +43,9 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; const std::vector interpolate_op_types = { - "bilinear_interp", "nearest_interp", "trilinear_interp", "bicubic_interp", - "linear_interp"}; + 
"bilinear_interp", "nearest_interp", "trilinear_interp", + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "nearest_interp_v2"}; for (const Node* node : graph->Nodes()) { if (node->IsOp() && diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e20c0667ec3bc2834eccb2b70b0e741d1051f7ce..57bee20247c9644941f87db48406ef2b097a23fb 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, multihead_op_desc.SetAttr("alpha", scale_attr); multihead_op_desc.SetAttr("head_number", head_number); + auto* mul0_op_desc = mul0->Op(); + auto* mul1_op_desc = mul1->Op(); + auto* mul2_op_desc = mul2->Op(); + if (mul0_op_desc->HasAttr("enable_int8")) { + multihead_op_desc.SetAttr("enable_int8", + mul0_op_desc->GetAttr("enable_int8")); + // all mul op has same input. + multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("X_scale")); + auto weight_scale0 = BOOST_GET_CONST( + std::vector, mul0_op_desc->GetAttr("weight_scale")); + auto weight_scale1 = BOOST_GET_CONST( + std::vector, mul1_op_desc->GetAttr("weight_scale")); + auto weight_scale2 = BOOST_GET_CONST( + std::vector, mul2_op_desc->GetAttr("weight_scale")); + auto weight_max = std::max(weight_scale0, weight_scale1); + weight_max = std::max(weight_max, weight_scale2); + multihead_op_desc.SetAttr("weight_scale", weight_max); + + if (mul0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("out_threshold", out_scale_max); + } + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); IR_NODE_LINK_TO(input0, multihead); @@ -682,6 +714,447 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, return fusion_count; } +PDNode* MultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("matmul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* 
transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("matmul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul", "Y"); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2_w_var = 
pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} + +static int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
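+  // MultiHeadMatmulV3Pattern matches three parallel matmul + elementwise_add +
+  // reshape2 + transpose2 branches (Q, K, V) feeding a Q*K matmul, bias add and
+  // softmax, followed by the context matmul (matmul or matmul_v2), transpose2
+  // and reshape2.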
+ MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&]( + Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, + Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, + Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. 
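+    // Combined layout is (dims_h, 3, dims_w): for each input row i, the Q, K
+    // and V weight rows are interleaved along the middle dimension, matching
+    // the (Hidden, 3, N*H) shape set on mul0_w above.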
+ for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc; + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, matmul_qk); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv}); + // Remove unneeded nodes. 
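+    // Only input0, mul0_w and eltadd0_b (which now hold the combined weight
+    // and bias), eltadd_qk_b and reshape2_qkv_out survive; the rest of the
+    // matched subgraph is deleted.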
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + } // namespace patterns void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { @@ -706,6 +1179,21 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } +void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + } // namespace ir } // namespace framework } // namespace paddle @@ -715,6 +1203,8 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS(multihead_matmul_fuse_pass_v3, + paddle::framework::ir::MultiHeadMatmulV3FusePass); REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() @@ -725,3 +1215,13 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .EQ("scale", 0) .LE("matmul", 1) .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index f5327dc71080be9edff30855a157465e0b35712a..c7f1336211d3463846a61b998c4f12f11095de32 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -89,9 +89,63 @@ struct MultiHeadMatmulPattern : public PatternBase { PATTERN_DECL_NODE(matmul_qkv); PATTERN_DECL_NODE(matmul_qkv_out); }; + +struct MultiHeadMatmulV3Pattern : public PatternBase { + MultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul_v3") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + 
PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + } // namespace patterns -// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. class MultiHeadMatmulFusePass : public FusePassBase { public: virtual ~MultiHeadMatmulFusePass() {} @@ -112,6 +166,16 @@ class MultiHeadMatmulV2FusePass : public FusePassBase { const std::string name_scope_{"multihead_matmul_fuse_v2"}; }; +class MultiHeadMatmulV3FusePass : public FusePassBase { + public: + virtual ~MultiHeadMatmulV3FusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"multihead_matmul_fuse_v3"}; +}; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index fd604ffe7b5de440fb3509a01fd2a1bc1a553574..35ba92006077999a541e700c6884db0d32f0bfab 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -77,7 +77,8 @@ bool PlacementPassBase::IsDefaultOpTypes(const std::string& op_type) const { // the corresponding pass. const std::vector not_default_op_types = { "bilinear_interp", "nearest_interp", "trilinear_interp", - "bicubic_interp", "linear_interp"}; + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "linear_interp_v2"}; bool is_interpolate_op = std::find(not_default_op_types.begin(), not_default_op_types.end(), op_type) != not_default_op_types.end(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cdefc24b52c0321d83b411ccd5db4..2fc39fd25d56c18ac510b550186eccaeb6eb9030 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], but got %s. 
" + "If you uses PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 479df876fbe007119c55261dd149bd515b0cd117..bf59c140005167e3be342b4039d2b13e5bddf1c6 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -54,6 +54,17 @@ static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") { return false; } +static bool IsFCWithPaddingWeights(Node* n) { + bool res = false; + if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" && + n->inputs.size() == 3U && n->outputs.size() == 1U) { + if (n->Op()->HasAttr("padding_weights")) { + res = BOOST_GET_CONST(bool, n->Op()->GetAttr("padding_weights")); + } + } + return res; +} + static bool IsParamOfFC(Node* n, const std::string& param_name) { if (IsInputOfFC(n) && n->inputs.empty() && (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { @@ -255,7 +266,7 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, fc_ops[i] = pattern->NewNode( [=](Node* x) { - if (!IsFCWithAct(x, "relu")) { + if (!IsFCWithAct(x, "relu") || IsFCWithPaddingWeights(x)) { return false; } auto* fc_out_var = x->outputs[0]; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index ada20113077c18080e358849073c3703e881d262..232e1d8da4ded39df732912bc86edb9a1fb54317 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -141,14 +141,6 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); - // check if is in ernie or not - if (!graph->Has(kEmbEltwiseLayernormPass) || - !graph->Has(kMultiheadMatmulPass)) { - LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " - << "Ernie/Bert model. 
Just skip this pass."; - return; - } - std::unordered_set del_node_set; // Create an SkipLayerNorm op node @@ -161,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Scale", {layer_norm_scale->Name()}); new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + if (elementwise->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + } + // outputs new_desc.SetOutput("Out", {layer_norm_out->Name()}); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 4307e51862df572e013431fceaaf89cc1cf6679c..8fe314cf5f18c5e8cc0a56ca8f231d32b9896aaf 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("NPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/load_op_lib.h b/paddle/fluid/framework/load_op_lib.h deleted file mode 100644 index 16cffe119d63e0cb8bd6ff76f4ac5792127f480d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/load_op_lib.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { - -template -T *DynLoad(void *handle, std::string name) { - T *func = reinterpret_cast(dlsym(handle, name.c_str())); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - PADDLE_ENFORCE_NOT_NULL( - func, - platform::errors::NotFound( - "Failed to load dynamic operator library, error code(%s).", errorno)); - return func; -} - -void LoadOpLib(const std::string &dso_name) { - void *handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - - typedef OpInfoMap &get_op_info_t(); - get_op_info_t *get_op_info = - DynLoad(handle, "PD_GetOpInfoMap"); - auto &op_info = get_op_info(); - auto *dyn_info_map = op_info.mutable_map(); - - typedef std::vector grad_op_desc_maker_t( - const OpDesc &, const std::unordered_set &, - std::unordered_map *, - const std::vector &); - - grad_op_desc_maker_t *grad_op_desc_maker = - DynLoad(handle, "PD_GetGradOpDescStrs"); - - auto &info_map = OpInfoMap::Instance(); - for (const auto &n : *(dyn_info_map)) { - auto type = n.first; - if (type == "recurrent" || type == "recurrent_grad" || - type == "conditional_block" || type == "conditional_block_grad") { - continue; - } - PADDLE_ENFORCE_NE(info_map.Has(n.first), true, - platform::errors::AlreadyExists( - "Operator (%s) has been registered.", type)); - OpInfo info; - info.creator_ = n.second.creator_; - - // If get the protocol buffer from dynamic library directly, there - // will be deconstruction error - // ** Error in `python`: free(): invalid pointer: - // ... paddle::framework::proto::OpDesc::SharedDtor() - // It seems a bug in protobuf, see - // https://github.com/protocolbuffers/protobuf/issues/435 - // So, get the serialized binary string from dynamic library, - // then deserialize to protocol buffer. 
- info.grad_op_maker_ = [grad_op_desc_maker]( - const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - std::vector strs = - grad_op_desc_maker(op_desc, no_grad_set, grad_to_var, grad_block); - std::vector> ret; - for (auto &str : strs) { - proto::OpDesc proto_desc; - PADDLE_ENFORCE_EQ(proto_desc.ParseFromString(str), true, - platform::errors::InvalidArgument( - "Failed to parse OpDesc from string.")); - ret.emplace_back(new OpDesc(proto_desc, nullptr)); - } - return ret; - }; - info.proto_ = n.second.proto_; - info.checker_ = n.second.checker_; - info.infer_var_type_ = n.second.infer_var_type_; - info.infer_shape_ = n.second.infer_shape_; - info.infer_inplace_ = n.second.infer_inplace_; - info.infer_no_need_buffer_vars_ = n.second.infer_no_need_buffer_vars_; - info.use_default_grad_op_desc_maker_ = - n.second.use_default_grad_op_desc_maker_; - info.use_empty_grad_op_desc_maker_ = n.second.use_empty_grad_op_desc_maker_; - - info_map.Insert(type, info); - } - - typedef void init_device_t(platform::DeviceContextPool *); - init_device_t *init_dev = - DynLoad(handle, "PD_InitDevicesPool"); - init_dev(&(platform::DeviceContextPool::Instance())); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef4f340fbc5464236063193d9b28f..0a6b5e44452fe191fce5fea058194a92e3a406de 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index b8911154e6bf7b3b3fbd5946e21401dba4002929..6b357aba1c5f9a4c0db53b20a9d47e64b71d0a11 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -14,16 +14,11 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#endif - -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -260,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index a9e15ee1758070794e7d933d2e9c093186ac60a5..e347f4ab387b82029a304e0aa2ea59a6c446f189 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -38,6 +38,13 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, need_merge_var_names_.push_back( trainer_desc.downpour_param().stat_var_names(i)); } +#ifdef PADDLE_WITH_HETERPS + for (int i = 0; i < thread_num_; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + } +#endif // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); @@ -102,13 +109,42 @@ void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { +#ifdef PADDLE_WITH_HETERPS + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); +#else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); +#endif workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); workers_[i]->CacheProgram(main_program); } +#ifdef PADDLE_WITH_HETERPS + for (int num = 0; num < thread_num_; ++num) { + auto place = places_[num]; + Scope* scope = workers_[num]->GetThreadScope(); + auto& block = main_program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + continue; + } + if (root_var->IsType()) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + TensorCopy(*root_tensor, place, thread_tensor); + } + } + } +#endif } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -139,10 +175,79 @@ void MultiTrainer::Run() { } } +#ifdef PADDLE_WITH_HETERPS +void MultiTrainer::MergeDenseParam() { +#ifdef PADDLE_WTIH_PSCORE + auto communicator = paddle::distributed::Communicator::GetInstance(); + auto& recv_ctx = communicator->GetRecvCtxMap(); + Scope* thread_scope = workers_[0]->GetThreadScope(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopy((*tensor), root_tensor->place(), root_tensor); + } + } +#endif +} +#endif + +template +void MultiTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; 
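+  // Bring the worker-side tensor to CPU as well, accumulate it element-wise into the CPU copy of the root tensor, then copy the summed result back into root_tensor.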
+ TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); + for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); +} + void MultiTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } +#ifdef PADDLE_WITH_HETERPS + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < places_.size(); j++) { + Scope* cur_thread_scope = workers_[j]->GetThreadScope(); + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + MergeDenseParam(); + +#endif root_scope_->DropKids(); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f107321958ba7be4d3ba31bd128f0cbbad694b85..7d55d8c41e3e92349dc9986b3d236db2ebdac01b 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -128,7 +128,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 7af5c54ceed74fc334dd10438cc5b0b62c06042d..519bf8c633a013fedab4f529dad014a71ad2d594 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -447,6 +447,11 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +void OpDesc::RemoveOutput(const std::string &name) { + outputs_.erase(name); + need_update_ = true; +} + bool OpDesc::HasProtoAttr(const std::string &name) const { auto &op_info = OpInfoMap::Instance(); if (op_info.Has(desc_.type())) { diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 95c33bca6c7f1df6ad71a3b4c2f82d726cafb5fc..1bc1a308e453bb816b00d6ca9a62358f8d33082a 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -65,6 +65,7 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); + void RemoveOutput(const std::string &name); bool HasAttr(const std::string &name) const { return attrs_.find(name) != attrs_.end(); diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index af657232e91a68aa26ab85faf63acdd1b8f191d1..ddd84bfd81abf53de3ad534cac44374dde631195 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 912e82f60ef5df81d163e1d8b937eb026e51478a..506c3eb1e0ad09658e431f03ac8231b4c796ec83 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 472c6f408266af6b47b7fdad2d1c9b3be6ee8cf5..593d4d839fa910d2ef81b3ae7483cee4399926cb 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -25,7 +25,8 @@ limitations under the License. */ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "gflags/gflags.h" +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -67,6 +68,8 @@ class Version; } // namespace framework } // namespace paddle +DECLARE_bool(check_kernel_launch); + namespace paddle { namespace framework { @@ -134,6 +137,19 @@ class OpRegistry { static std::unique_ptr CreateOp(const OpDesc& op_desc); }; +template +inline void CheckKernelLaunch(const char* op_type) {} + +#ifdef PADDLE_WITH_CUDA +template <> +inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>( + const char* op_type) { + if (FLAGS_check_kernel_launch) { + PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type); + } +} +#endif + template struct OpKernelRegistrarFunctor; @@ -162,8 +178,9 @@ struct OpKernelRegistrarFunctor { RegisterKernelClass( op_type, library_type, customized_type_value, - [](const framework::ExecutionContext& ctx) { + [op_type](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); + CheckKernelLaunch(op_type); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor @@ -223,8 +240,13 @@ struct OpKernelRegistrarFunctorEx(op_type, library_type, - customized_type_value, Functor()); + RegisterKernelClass( + op_type, library_type, customized_type_value, + + [op_type](const framework::ExecutionContext& ctx) { + Functor()(ctx); + CheckKernelLaunch(op_type); + }); constexpr auto size = std::tuple_size>::value; @@ -304,6 +326,9 @@ struct OpKernelRegistrarFunctorEx #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 833a28a7579ca09279147c346abf5fffdc7f3324..955c917b2c1bf4119e13b5d8cdb813b036fbf587 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -47,9 +47,6 @@ DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); -DEFINE_bool(fast_check_nan_inf, false, - "Fast checking NAN/INF after each operation. 
It will be a little" - "bit slow, much faster than check_nan_inf"); namespace paddle { namespace framework { @@ -211,6 +208,16 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { #else auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; platform::SetXPUDeviceId(dev_id); +#endif + } else if (platform::is_npu_place(place)) { +#ifndef PADDLE_WITH_ASCEND_CL + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with NPU support.", + place)); +#else + auto dev_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + platform::SetNPUDeviceId(dev_id); #endif } @@ -1173,25 +1180,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #endif } - if (FLAGS_fast_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - // only check inserted vars, - // please see executor.py for details of fast_check_nan_inf - if (vname.rfind("debug_var") == 0) { - VLOG(3) << "debugging nan/inf in var " << vname; - - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(type_, vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(type_, vname, - var->Get().value()); - } - } - } - } - if (FLAGS_check_nan_inf) { framework::details::CheckOpHasNanOrInf(*this, exec_scope, place); } @@ -1248,7 +1236,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, } } } - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + VLOG(3) << "op type:" << type_ + << ", expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_MKLDNN @@ -1270,6 +1259,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (kernel_iter == kernels.end() && + is_npu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing NPU kernel: " << type_ + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } #endif PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), platform::errors::NotFound( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e9ecf9b5a8397880fb878babfe329701023984b1..3fc61581eca720f64d4b19fd70b9b619cea9fcef 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,7 +27,6 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" @@ -420,6 +419,7 @@ class ExecutionContext { const RuntimeContext Context() const { return ctx_; } std::string DebugString() const { return op_.DebugString(); } + const OperatorBase& GetOp() const { return op_; } private: const OperatorBase& op_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2f280d5cc4ae0b3056df553b19f47979c835f71e..eb021609e825839825b657ef516a18c5b4cbcc74 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,7 +27,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -625,144 +624,21 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. + InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -775,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, 
loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). + for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. 
- bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1364,6 +1058,412 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl 
comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + 
graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void ParallelExecutor::CreateVariableInfos( + std::vector *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() should be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() should be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph) { + std::vector final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr.
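+    // Detect a data-parallel inference graph: in this branch the ParallelSSAGraphExecutor keeps one graph per device, and partial feed support can be enabled for inference below.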
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + if (member_->use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU) + VLOG(3) << "use BindThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); + } + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs shoule contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map shoule contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], 
member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2cc7f12fd76be7e2b2b041bb7160..d4d0b534b55f05f1e4145064eb55e7858f25185d 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector &local_scopes, + bool create_new); + + std::unordered_map CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new); + + std::vector CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector *var_infos, + ir::Graph *graph); + + std::vector CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 8d350f70165b667f8ea5e152dce29e3dfac74ec0..3bd50229b94deb0e09f242d529c3225ec2e4408a 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
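+// NOTE: pipeline training is now also compiled for Ascend NPU builds (PADDLE_WITH_ASCEND_CL), in addition to NCCL/RCCL GPU builds.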
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -24,6 +25,9 @@ namespace framework { void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { const auto& section_params = trainer_desc.section_param(); + const int num_pipeline_stages_ = section_params.num_pipeline_stages(); + const int pipeline_stage_ = section_params.pipeline_stage(); + const int schedule_mode_ = section_params.schedule_mode(); num_microbatches_ = section_params.num_microbatches(); VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_; trainer_desc_ = trainer_desc; @@ -31,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = @@ -39,6 +47,9 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, this_worker->SetPlace(place_); this_worker->Initialize(trainer_desc); this_worker->SetMicrobatchNum(num_microbatches_); + this_worker->SetPipelineStageNum(num_pipeline_stages_); + this_worker->SetPipelineStage(pipeline_stage_); + this_worker->SetScheduleMode(schedule_mode_); } void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -65,35 +76,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id, const ProgramDesc& program, const platform::Place& place) { auto& global_block = program.Block(0); - std::map param_map; - for (auto& var : global_block.AllVars()) { - if (var->Persistable()) { - param_map[var->Name()] = 1; - } - } for (auto& var : global_block.AllVars()) { - bool is_param_grad = false; - size_t pos = 0; - if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) { - auto prefix_name = var->Name().substr(0, pos); - if (param_map.find(prefix_name) != param_map.end()) { - is_param_grad = true; - } - } if (var->Persistable() && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (is_param_grad && microbatch_id == 0) { - auto* ptr = minibatch_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create grad for persistable var: " << var->Name() + VLOG(5) << "Create persistable var: " << var->Name() << ", which pointer is " << ptr; - } else if (!var->Persistable() && !is_param_grad) { + } else if (!var->Persistable()) { auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(3) << "Create variable " << var->Name() << " for microbatch " + VLOG(5) << "Create variable " << var->Name() << " for microbatch " << microbatch_id << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); } diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index cfef80b8d3777817d767ba201e4dad85cf8bdc9c..4ceb0c5c8248143791238a7b9077e402a7c1b832 100644 --- 
a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e77932fa5f226518f7be4177488d6cc55f2fce06..39bc3f040639bfb9271ed808c8905b4bb5e89a92 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -19,10 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -64,7 +60,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, pull_dense_worker_ = PullDenseWorker::GetInstance(); pull_dense_worker_->Initialize(trainer_desc); SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); trainer_desc_ = trainer_desc; workers_.resize(place_num); for (int i = 0; i < place_num; ++i) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 2597901d91f36bab7e6a1e3553d8c43bb7a686f8..66d8a40dda160752e64eae8775a2045509e575e3 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,8 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" @@ -130,8 +128,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { } } } - // pull_queue_ = paddle::framework::MakeChannel>(); - // push_queue_ = paddle::framework::MakeChannel>(); } void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index a4207deb7e8113ab07b8b7f9b227e121f3e0f1bc..e7c23eab1fa5fcab8137c12845beeb237d784e16 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -20,7 +20,6 @@ #include #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 735c86faf082b817ee0a0ff35d127112fe3058b4..00ff50abadd185eb6ac8907d89aaf455dd5a7f16 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -22,15 +23,79 @@ class TrainerDesc; uint64_t SectionWorker::batch_id_(0); -void SectionWorker::Initialize(const TrainerDesc& desc) { +void SectionWorker::Initialize(const TrainerDesc &desc) { dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); program_.reset( new ProgramDesc(desc.section_param().section_config().program_desc())); - for (auto& op_desc : program_->Block(0).AllOps()) { + for (auto &op_desc : program_->Block(0).AllOps()) { ops_.push_back(OpRegistry::CreateOp(*op_desc)); } } +void SectionWorker::RunForward( + int micro_id, std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + // We run op with op_role = kLRSched only for the first microbatch + // to avoid increasing the @LR_DECAY_STEP@ multiple times. + bool run_first_mbatch = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))) || + (op_role == static_cast(OpRole::kLRSched)); + bool run_others = (op_role == static_cast(OpRole::kForward)) || + (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))); + if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) { + VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " + << micro_id; + op->Run(*microbatch_scopes_[micro_id], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(), + unused_vars_, gc.get()); + } + } + } +} + +void SectionWorker::RunBackward( + int micro_id, std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + if ((op_role == static_cast(OpRole::kBackward)) || + (op_role == (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)))) { + VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " + << micro_id; + op->Run(*microbatch_scopes_[micro_id], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(), + unused_vars_, gc.get()); + } + } + } +} + +void SectionWorker::RunUpdate( + std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + if (op_role == static_cast(OpRole::kOptimize)) { + VLOG(3) << "Update: running op " << op->Type(); + op->Run(*microbatch_scopes_[num_microbatches_ - 1], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1], + op.get(), unused_vars_, gc.get()); + } + } + } +} + void SectionWorker::TrainFiles() { VLOG(5) << "begin section_worker TrainFiles"; @@ -48,69 +113,56 @@ void SectionWorker::TrainFiles() { #endif } - for (int i = 0; i < num_microbatches_; ++i) { - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - // We run op with op_role = kLRSched only for the first microbatch - // to avoid increasing the @LR_DECAY_STEP@ multiple times. 
- bool run_first_mbatch = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)) || - op_role == static_cast(OpRole::kLRSched); - bool run_others = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)); - if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) { - VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " - << i; - op->Run(*microbatch_scopes_[i], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_, - gc.get()); - } - } + if (schedule_mode_ == 0) { + // F-then-B scheduler which runs Forward phase for all microbatches, + // then runs Backward phase for all microbatches. + // step1: run forward + for (int i = 0; i < num_microbatches_; ++i) { + RunForward(i, gc, unused_vars_); } -#ifdef PADDLE_WITH_RCCL - hipDeviceSynchronize(); -#else - cudaDeviceSynchronize(); -#endif - } - - // backward pass - for (int i = 0; i < num_microbatches_; ++i) { - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kBackward) || - op_role == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss))) { - VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " - << i; - op->Run(*microbatch_scopes_[i], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_, - gc.get()); - } - } + // step2: run backward + for (int i = 0; i < num_microbatches_; ++i) { + RunBackward(i, gc, unused_vars_); + } + // step3: run update + RunUpdate(gc, unused_vars_); + } else { + // 1F1B scheduler, which runs forward phase and backward phase alternately + // after startup phase. For a stage, the number of microbatches for + // startup is num_pipeline_stages_ - pipeline_stage_ - 1, where + // num_pipeline_stages_ is the total number of pipeline stages and + // pipeline_stage_ is the pipeline stage of the current device.
+ auto startup_steps = num_pipeline_stages_ - pipeline_stage_ - 1; + VLOG(3) << "startup_steps:" << startup_steps + << ", num_stages: " << num_pipeline_stages_ + << ", stage:" << pipeline_stage_; + PADDLE_ENFORCE_GT( + num_microbatches_, startup_steps, + platform::errors::InvalidArgument( + "To use pipeline with 1F1B scheduler, please make sure number of " + "microbatches (%d) is larger than startup steps (%d).", + num_microbatches_, startup_steps)); + int fw_step = 0; + int bw_step = 0; + // startup phase + while (fw_step < startup_steps) { + RunForward(fw_step, gc, unused_vars_); + fw_step += 1; } -#ifdef PADDLE_WITH_RCCL - hipDeviceSynchronize(); -#else - cudaDeviceSynchronize(); -#endif - } - // update pass - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kOptimize)) { - VLOG(3) << "Update: running op " << op->Type(); - op->Run(*microbatch_scopes_[0], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[0], op.get(), unused_vars_, - gc.get()); - } + // 1f1b phase + while (fw_step < num_microbatches_) { + RunForward(fw_step, gc, unused_vars_); + fw_step += 1; + RunBackward(bw_step, gc, unused_vars_); + bw_step += 1; + } + // backward phase + while (bw_step < num_microbatches_) { + RunBackward(bw_step, gc, unused_vars_); + bw_step += 1; } + RunUpdate(gc, unused_vars_); } dev_ctx_->Wait(); ++batch_id_; diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad58375fb08f23e2c7bdef27fdaea7384..7e48d0dc5f96203c4bc89f954b82dfa582eddbc9 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56cacbb71512b9e743af281b09fc531..e53e3d973c524657a7b579d96d0f51a39ba40f12 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 54f779813063362956130ec9314365f89a234c1e..101463756c0a5143536362c706ae08333673c831 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -125,
+125,54 @@ TEST(Tensor, MutableData) { float* p2 = nullptr; // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + EXPECT_EQ(p1, p2); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::NPUPlace(0)); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), + platform::NPUPlace(0)); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::NPUPlace(0)); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::NPUPlace(0)); EXPECT_EQ(p1, p2); } #endif @@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) { framework::Tensor src_tensor; framework::Tensor dst_tensor; src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::NPUPlace(0)); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -216,7 +255,34 @@ TEST(Tensor, Slice) { { framework::Tensor src_tensor; src_tensor.mutable_data(framework::make_ddim({6, 9}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address 
+ 9 * 2 * sizeof(double), slice_data_address); + } +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::NPUPlace(0)); framework::Tensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -227,12 +293,12 @@ TEST(Tensor, Slice) { reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::CUDAPlace())); + src_tensor.dims(), platform::NPUPlace(0))); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::CUDAPlace())); + slice_tensor.dims(), platform::NPUPlace(0))); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c6ac30a369859db9de244990231a307074e973ed..78fd1af09e29458ec84549c55dd99f8c29da29db 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(zhiqiu): handle different condition like CUDA code below + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), 
dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* npu -> npu*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor { return GetResultHelper(out, gpu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPlace& npu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", npu)); + // return GetResultHelper(out, npu); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> { #endif } + void VisitorImpl(const platform::NPUPlace& npu) const { + // TODO(zhiqiu) + } + void VisitorImpl(const platform::CPUPlace& cpu) const { int num = in_.numel(); const bool* in_ptr = in_.data(); @@ -746,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_npu_place(tensor.place())) { +#ifdef PADDLE_WITH_ASCEND_CL + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& npu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + npu_dev_ctx.stream()); + npu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); #endif } else { os.write(static_cast(data_ptr), @@ -801,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -812,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( 
"CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { @@ -859,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -870,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 8a127e0ed5929525a96914c70f20411a61ed88e6..22c8e1c1665f121cda6ba33f23cb7fc0749da025 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -136,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -158,6 +158,59 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from + // cudaMemcpyAsync. + // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. + // aclrtMemcpyAsync is really "async" between cpu <-> npu. + // Since vector is on cpu, I think this function should be a "sync" operation, + // so pass nullptr as stream to memory::Copy(). + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif +} + +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. 
+ // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); + } +#endif + delete[] array; } template @@ -172,6 +225,23 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -195,6 +265,52 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif +} + +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; } template @@ -216,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const 
Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be201344fa8d7f817792315b7e6ef..8587ee8d1e91969be86ab50d18e70b6a0d034e98 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca290a50b42fe0bfe37385c15bc54d8c5a305a06..636760029fedc4e3a570f9a63db5d1f84795ab62 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -27,8 +27,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include "paddle/fluid/framework/heter_service.h" +//#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" @@ -47,6 +47,10 @@ class PullDenseWorker; class Scope; class VarDesc; class DeviceWorker; +class HeterWrapper; +class HeterRequest; +class HeterResponse; + template class ChannelObject; @@ -109,13 +113,22 @@ class MultiTrainer : public TrainerBase { virtual Scope* GetWorkerScope(int thread_id); virtual std::string GetDumpPath(int tid); + template + void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); +#ifdef PADDLE_WITH_HETERPS + + void MergeDenseParam(); +#endif + protected: int thread_num_; std::vector threads_; std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; - +#ifdef PADDLE_WITH_HETERPS + std::vector places_; +#endif int mpi_rank_; int mpi_size_; int dump_file_num_; @@ -313,7 +326,6 @@ class PSGPUTrainer : public TrainerBase { float scale_datanorm_; paddle::platform::Place place_; ProgramDesc program_; - std::shared_ptr fleet_ptr_; std::shared_ptr pull_dense_worker_; std::vector> workers_; std::vector places_; @@ -324,7 +336,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 70481cf3727012e4cf41d235154eb277d92cc92f..504885ff5ccbce760c0a659aedabef6790de5f1a 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -93,6 +93,9 @@ message SectionWorkerParameter { optional int32 start_cpu_core_id = 4 [ default = 1 ]; repeated string param_need_sync = 5; optional int32 num_microbatches = 6; + optional int32 num_pipeline_stages = 7 [ default = 1 ]; + optional int32 pipeline_stage = 8 [ default = 1 ]; + optional int32 schedule_mode = 9 [ default = 0 ]; } message SectionConfig { diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 6b9dbece8974c286a390627f25e4a25ee8bfb8d3..15073b6f78c5b35209c0c38135d067cb660e487e 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -76,7 +76,8 @@ REGISTER_TRAINER_CLASS(HeterBoxTrainer); (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e2363bbfe98b375807ba91e7b5a2ae..e43cccfe648165ce962b779cb513effe990d0ab3 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, 
std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index d2adbdd34512b3404550eb48aafd619ce015a028..0f8465ab8948e425ec48d10052643699e3c10ce7 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -53,27 +53,28 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { // Use pointer here for safe static deinitialization static auto *allow_set = new std::unordered_set({ // called once - "batch_norm", // 0 - "batch_norm_grad", // 0 - "sync_batch_norm", // 0 - "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 - "dgc_momentum", // 0 - "fake_quantize_range_abs_max", // 0 - "rmsprop", // 0 - "sequence_conv_grad", // 0 - "roi_perspective_transform_grad", // 0 - "fill_zeros_like", // 1 - "fill_any_like", // 1 - "nce_grad", // 1 - "precision_recall", // 1 - "fusion_seqpool_cvm_concat", // 2 - "fused_batch_norm_act", // 2 - "fused_batch_norm_act_grad", // 2 - "data_norm", // 0 - "data_norm_grad", // 0 - "update_loss_scaling", // 0 + "batch_norm", // 0 + "batch_norm_grad", // 0 + "sync_batch_norm", // 0 + "sync_batch_norm_grad", // 0 + "inplace_abn", // 0 + "inplace_abn_grad", // 0 + "dgc_momentum", // 0 + "fake_quantize_range_abs_max", // 0 + "rmsprop", // 0 + "sequence_conv_grad", // 0 + "roi_perspective_transform_grad", // 0 + "fill_zeros_like", // 1 + "fill_any_like", // 1 + "nce_grad", // 1 + "precision_recall", // 1 + "fusion_seqpool_cvm_concat", // 2 + "fused_batch_norm_act", // 2 + "fused_batch_norm_act_grad", // 2 + "data_norm", // 0 + "data_norm_grad", // 0 + "update_loss_scaling", // 0 + "fused_embedding_eltwise_layernorm", // 0 }); return *allow_set; } diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 8affeda67b3d07d67ceed2b657b285210e1bd076..2e35f9b845ac730b22841df70c5d20b1b6cedd45 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b0d8f43a90f35f08c7bcc09858842424f53ddbe0..473df85aa0421ea74280029a11ad613f8537d6bd 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA @@ -37,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -51,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -163,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 6e65bc2c932877e5365f4533631d50afae4465b3..4cdfba29249ccf257dab3bd2f1707cd7037c2e33 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -15,7 +15,6 @@ limitations under the License. */ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/variable.h" namespace paddle { diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 5cfa2c2278662d6164464450e0993d3fe676382c..df4dcc6b5d91f3b6e18322116edafe7c133062bd 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -4,7 +4,7 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index a56458b21398b31dd036cf840e4b837a8dc7eba4..b4154737e0fbc6245617fb0208f6623e4ebb5943 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -26,7 +26,24 @@ class VarBase; AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), - block_ops_(new std::unordered_set()) {} + block_ops_(new std::unordered_set()), + unsupported_fp16_ops_(new std::unordered_set()) { + auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); + auto fp16_dtype = framework::proto::VarType::FP16; + for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { + bool supported = false; + for (auto& 
kernel_type : it->second) { + if (platform::is_gpu_place(kernel_type.first.place_) && + kernel_type.first.data_type_ == fp16_dtype) { + supported = true; + } + } + if (!supported) { + unsupported_fp16_ops_->insert(it->first); + } + } +} + AmpOperators::~AmpOperators() {} AmpOperators& AmpOperators::Instance() { @@ -44,16 +61,26 @@ AmpOperators::GetMutableBlockOps() { return block_ops_; } +std::shared_ptr> +AmpOperators::GetMutableUnsupportedFp16Ops() { + return unsupported_fp16_ops_; +} + std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { os << "allow ops: "; auto allow_ops = ops.GetMutableAllowOps(); std::copy((*allow_ops).begin(), (*allow_ops).end(), std::ostream_iterator(os, " ")); - os << "; "; + os << "\n"; os << "block ops: "; auto block_ops = ops.GetMutableBlockOps(); std::copy((*block_ops).begin(), (*block_ops).end(), std::ostream_iterator(os, " ")); + os << "\n"; + os << "unsupported fp16 ops: "; + auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops(); + std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(), + std::ostream_iterator(os, " ")); return os; } @@ -133,7 +160,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first != "X") { continue; } @@ -156,9 +184,16 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, return new_ins; } else { auto dst_type = GetPromoteType(ins); + // NOTE(zhiqiu): if the op has no fp16 kernel, fall back to fp32. + if (dst_type == framework::proto::VarType::FP16 && + AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first == "X" && dst_type == framework::proto::VarType::FP32) { continue; } diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 619c6b0baf896f74eb6eae998801697ec4de6fb0..fa76c19688a693e036dd06bb54ad24ddf748a8af 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -40,6 +40,9 @@ class AmpOperators { std::shared_ptr> GetMutableBlockOps(); + std::shared_ptr> + GetMutableUnsupportedFp16Ops(); + private: AmpOperators(); // forbid calling default constructor @@ -50,6 +53,9 @@ class AmpOperators { // The set of ops that support fp16 calculation and are considered numerically // dangerous and whose effects may also be observed in downstream ops. std::shared_ptr> block_ops_; + + // The set of ops that have no fp16 CUDA kernel.
+ std::shared_ptr> unsupported_fp16_ops_; }; std::ostream& operator<<(std::ostream& os, AmpOperators& ops); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f117074a415084b3e0f10675954b..7bcc3d6c608c947f71ae030cfb17d4a89495939e 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. 
Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } - VLOG(3) << "Init first node of backward"; + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } + + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -141,17 +166,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. 
So // only the information of grad_pending_node can be used to find the @@ -235,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -262,15 +278,41 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasVariableWrapperHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetVariableWrapperHooks().size() + << " hooks of " << op_type << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -292,10 +334,15 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary outputs here? ] + * + * - construct the temp output map, avoid to disrupt graph + * - replace the element in the map by temp var, because a + * var may be coresponding to several grad var in one op + */ NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -361,7 +408,8 @@ void BasicEngine::Execute() { VLOG(10) << "create temporary var of " << var->Name() << " for sum gradient within this graph!"; } else if (!inplace_grad_name_map.empty() && - inplace_grad_name_map.count(pair.first)) { + inplace_grad_name_map.count(pair.first) && + bwd_ins.count(inplace_grad_name_map.at(pair.first))) { // When calculate Inplace grad op, create a new output var. // If a tmp var has been created, there is no need to create it // again. @@ -408,10 +456,36 @@ void BasicEngine::Execute() { } } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. 
+ * - use `tmp_ins_ptr`, only copy bwd_ins when the var in bwd_ins + * hold hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + try { + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, + cur_op.Attrs(), cur_op.place()); + } + } catch (platform::EnforceNotMet& exception) { + Clear(); + throw std::move(exception); + } catch (std::exception& ex) { + Clear(); + PADDLE_THROW(platform::errors::External("%s", ex.what())); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -428,15 +502,14 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` + accumulator->CallGradientHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + // 3. Call backward Hooks for `var_` + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); @@ -470,7 +543,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa61e438e5afdc72681a96ee21c996b..49761a8df0b6b1d8494e72b6ea7b67c0fa15eb6b 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. 
If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 873068a0d310dc38dab867e73ff4577aae3a6f23..16f9454e9376e4368a478cf8adf9e3f988868785 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -19,12 +19,11 @@ #include #include +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" #include "paddle/fluid/string/string_helper.h" @@ -77,7 +76,7 @@ void BKCLParallelContext::Init() { bkcl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { - // generate the unique ncclid on the root worker + // generate the unique bkclid on the root worker for (size_t i = 0; i < bkcl_ids.size(); ++i) { auto ret = bkcl_get_unique_id(&bkcl_ids[i]); PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, @@ -99,6 +98,28 @@ void BKCLParallelContext::Init() { } } +void BKCLParallelContext::InitWithRingID(int ring_id) { + std::vector bkcl_ids; + bkcl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique bkclid on the root worker + auto ret = bkcl_get_unique_id(&bkcl_ids[0]); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "BKCL get unique id failed [%d]", ret)); + } + BcastBKCLId(bkcl_ids, 0); + + int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id + << " ring id: " << ring_id; + // it will assign bkcl_comm in XPUDeviceContext within ring_id + platform::BKCLCommContext::Instance().CreateBKCLComm( + &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); +} + void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -146,8 +167,6 @@ void BKCLParallelContext::WaitCompute(int ring_id) { platform::errors::OutOfRange("Ring id expected < nrings," "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - // TODO(wangxi16): [Performance optimize] Maybe need to put Wait and - // bkcl_allreduce to comm thread, for bkcl_allreduce is blocking now. 
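// ---------------------------------------------------------------------------
// Illustrative sketch (not from this patch): how the widened
// BasicEngine::Init(tensors, grad_tensors, retain_graph) declared above might
// be driven now that the engine seeds backward from several roots at once.
// `loss_a`, `loss_b`, `grad_a`, `grad_b` are hypothetical VarBase handles and
// the vector element type is assumed from the declaration shown above.
// ---------------------------------------------------------------------------
#include <memory>
#include <vector>

#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/layer.h"

void RunBackwardSketch(
    const std::shared_ptr<paddle::imperative::VarBase>& loss_a,
    const std::shared_ptr<paddle::imperative::VarBase>& loss_b,
    const std::shared_ptr<paddle::imperative::VarBase>& grad_a,
    const std::shared_ptr<paddle::imperative::VarBase>& grad_b) {
  paddle::imperative::BasicEngine engine;
  // Every root and its initial gradient is handed over in one call, so
  // PrepareDeps()/Execute() can push all init nodes into the BFS queue.
  engine.Init({loss_a, loss_b}, {grad_a, grad_b}, /*retain_graph=*/false);
  engine.Execute();
}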
auto compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); @@ -167,6 +186,12 @@ void BKCLParallelContext::WaitComm(int ring_id) { comm_dev_ctx->Wait(); } +void BKCLParallelContext::SynchronizeCompute() { + auto compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + } // namespace imperative } // namespace paddle #endif diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index d7d917f20082ac6e49bae604cdde04343b4dced4..652b7689666c6c66c4efe6edda0c23acfc0cab27 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -45,6 +47,8 @@ class BKCLParallelContext : public ParallelContext { void WaitCompute(int ring_id) override; void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; }; } // namespace imperative diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index 71ea82e9a19e89965d5faf712fc5a9411c28db3e..c43149c9b563e73d7bdc8cbb39b8303083d2ac84 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -71,9 +71,12 @@ void EraseLoadProcessPIDs(int64_t key) { } \ } while (0) -#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \ - static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx) { \ - SIGNAL_HANDLE(SIGNAL); \ +#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME, ERROR_MSG) \ + static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx) { \ + auto _w = \ + write(STDERR_FILENO, ERROR_MSG, sizeof(ERROR_MSG) / sizeof(char)); \ + (void)_w; \ + SIGNAL_HANDLE(SIGNAL); \ } #define REGISTER_SPEC_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \ @@ -84,8 +87,18 @@ void EraseLoadProcessPIDs(int64_t key) { SIGNAL_HANDLE(SIGNAL); \ } -REGISTER_SIGNAL_HANDLER(SIGSEGV, SIGSEGV_handler); -REGISTER_SIGNAL_HANDLER(SIGBUS, SIGBUS_handler); +REGISTER_SIGNAL_HANDLER(SIGSEGV, SIGSEGV_handler, + "ERROR: Unexpected segmentation fault encountered in " + "DataLoader workers.\n"); +REGISTER_SIGNAL_HANDLER( + SIGBUS, SIGBUS_handler, + "ERROR: Unexpected BUS error encountered in DataLoader worker. 
" + "This might be caused by insufficient shared memory (shm), " + "please check whether use_shared_memory is set and storage space " + "in /dev/shm is enough\n"); +REGISTER_SIGNAL_HANDLER(SIGFPE, SIGFPE_handler, + "ERROR: Unexpected floating-point exception " + "encountered in DataLoader worker.\n") REGISTER_SPEC_SIGNAL_HANDLER(SIGTERM, SIGTERM_handler); static inline void setSignalHandler(int signal, @@ -105,6 +118,7 @@ static inline void setSignalHandler(int signal, void SetLoadProcessSignalHandler() { setSignalHandler(SIGSEGV, &SIGSEGV_handler, nullptr); setSignalHandler(SIGBUS, &SIGBUS_handler, nullptr); + setSignalHandler(SIGFPE, &SIGFPE_handler, nullptr); setSignalHandler(SIGTERM, &SIGTERM_handler, nullptr); } diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index a367840472827505a7eb48310eb57bfc0c01a3ee..7fefc9ccc67b52aab5073d3dd6c738ab07075e78 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -279,6 +279,8 @@ class TracedGradOp { void SetType(const std::string& type) { op_->SetType(type); } + const framework::OperatorBase& InnerOp() const { return op_->InnerOp(); } + void SetAttrMap(const framework::AttributeMap& attrs) { return op_->SetAttrMap(attrs); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index deb504a1b657e4f348e47ea9a6e7b80029e109d4..43546cf99c69ffa3aa1f1a792e7b344ed0735a31 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -18,7 +18,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/imperative/layer.h" @@ -116,6 +115,23 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void operator()(const platform::NPUPlace& place) { + // TODO(zhiqiu): SUPPORT it + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#else + void operator()(const platform::NPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#endif + // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( @@ -385,8 +401,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been calculated done, the inner_var_ + * should be added to the var_. 
*/ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; @@ -397,7 +413,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -428,10 +444,65 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasVariableWrapperHook()) { + VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetVariableWrapperHooks().size(); + for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); + if (var_->HasVoidHook()) { + for (const auto& hook : var_->GetVoidHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7dae6f326021dcdef7f3528661e787d..6411dce4405c11795418fb8334e26b32079e7596 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. 
+ if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } @@ -93,42 +90,38 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. Gradient accumulation across batchs + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batchs ] + * | + * [ Gradient reduce / allreduce hooks ] + + * Because we currently intend to accumulate these two gradient + * accumulation in one GradientAccumulator, We must distinguish between + * two types of hooks. + + * And the InplaceVariableWrapperHook does not allow users to register + * directly, and is currently only used to support the reduce strategy of + * parallel multi-card training. + */ + + void CallGradientHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; @@ -137,7 +130,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c7bdd28a08687ed65e98b5741e2865..fa929b7c7a51c77eaf307ab2900f58fc452e6969 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,215 +18,63 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, - * taking the input of the current grad OpBase as input, and - * executing python hooks (user-defined) or C++ hooks (developer-defined) - * to achieve the purpose of custom operations on the interior VarBase - * gradient. 
+/** [ VariableWrapper Hook ] * - * @note OpBasePreHook will not change the input gradient VarBase. + * @brief This hook functor is executed before the grad OpBase is executed or + * after gradient accumulation completed in current batch. + * 1. For interior var, VariableWrapper Hook take the input of the + * current grad OpBase as input. + * 2. For leaf var, VariableWrapper Hook take the inner_var_ of + * GradientAccumulator as input. * - * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] + * @note This hook functor will not change the input gradient VariableWrapper, + * but if you copy the input VariableWrapper and change the value of + * Variable in VariableWrapper, the value of input will also be changed, + * because they shared same PlaceHolder. * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ] * - * @note [Why only can be used for interior VarBase?] + * We expect If set OpBase post hook, when the op executed end, the + * op's output gradient may not be the final state, because it may need + * other op's gradient output to accumulated to it. But before op can + * be executed, the gradient output must have been accumulated to final + * value. + * + * @note [ Why Leaf gradient is special? ] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, we should call hooks after gradient accumulation + * completed. */ -class OpBasePreHook { +class VariableWrapperHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; + virtual ~VariableWrapperHook() = default; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current - * gradientafter the GradientAccumulator has accumulated the gradient. - * Leaf GradVarBase has no next OpBase, if we want to register hook - * for it, we also need to wait until the leaf GradVarBase accumulation - * is completed, so we can add post hook to GradientAccumulator. - * - * @note GradAccumulatorPostHook will change the grad VarBase value. - * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. - */ -class GradAccumulatorPostHook { - public: - virtual ~GradAccumulatorPostHook() = default; - virtual void operator()(VariableWrapper* var) = 0; -}; - -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. 
Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class CppVariableWrapperHook : public VariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( - std::function fn) + explicit CppVariableWrapperHook( + std::function( + const std::shared_ptr&)>&& fn) : fn_(std::move(fn)) {} - void operator()(VariableWrapper* var) override { fn_(var); } - - private: - std::function fn_; -}; - -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. - * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. - */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); + std::shared_ptr operator()( + const std::shared_ptr& var) override { + return fn_(var); } - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. 
- std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); + std::function( + const std::shared_ptr&)> + fn_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 53750f7bf02be206d8f6524bf1b1894c570978d1..1a44f50275ef8f524f4291468cb25b5a4bd59e85 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -69,6 +69,7 @@ UniqueBlockVarGenerator::UniqueBlockVarGenerator( std::string UniqueBlockVarGenerator::NameOf(const std::weak_ptr &var, const std::string &prefix) { + VLOG(3) << "Finding: " << var.lock()->Name(); auto all_vars_iter = all_vars_.find(var); PADDLE_ENFORCE_EQ(all_vars_iter != all_vars_.end(), true, platform::errors::NotFound( @@ -111,6 +112,15 @@ void UniqueBlockVarGenerator::InsertNewVarInBlock( } } +bool ProgramDescTracer::ContainVar(const std::weak_ptr &var) const { + auto vars_iter = vars_.find(var); + bool ret = (vars_iter != vars_.end()); + if (!ret) { + VLOG(5) << "Can't found variable: " << var.lock()->Name(); + } + return ret; +} + void ProgramDescTracer::InsertOp(const std::string &type, const NameVarBaseMap &inputs, const NameVarBaseMap &outputs, @@ -147,12 +157,16 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector feed_var_names; for (auto &feed_var : feed_vars) { - feed_var_names.emplace_back(generator.NameOf(feed_var, feed_prefix)); + if (ContainVar(feed_var)) { + feed_var_names.emplace_back(generator.NameOf(feed_var, feed_prefix)); + } } std::vector fetch_var_names; for (auto &fetch_var : fetch_vars) { - fetch_var_names.emplace_back(generator.NameOf(fetch_var, fetch_prefix)); + if (ContainVar(fetch_var)) { + fetch_var_names.emplace_back(generator.NameOf(fetch_var, fetch_prefix)); + } } for (auto &op : ops_) { @@ -164,7 +178,9 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector names; names.reserve(pair.second.size()); for (auto &var : pair.second) { - names.emplace_back(generator.NameOf(var, tmp_prefix)); + if (ContainVar(var)) { + names.emplace_back(generator.NameOf(var, tmp_prefix)); + } } op_desc->SetInput(pair.first, std::move(names)); @@ -174,7 +190,9 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector names; names.reserve(pair.second.size()); for (auto &var : pair.second) { - names.emplace_back(generator.NameOf(var, tmp_prefix)); + if (ContainVar(var)) { + names.emplace_back(generator.NameOf(var, tmp_prefix)); + } } op_desc->SetOutput(pair.first, std::move(names)); diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.h b/paddle/fluid/imperative/jit/program_desc_tracer.h index 8e2e59a49ed7be473a1f89aaefde6bf123a9dea9..b231efb0e53a515fcd2e6c58c62e49f7bdccf1db 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.h +++ b/paddle/fluid/imperative/jit/program_desc_tracer.h @@ -66,7 +66,7 @@ class ProgramDescTracer { const std::string &feed_prefix, const std::vector> &fetch_vars, const std::string &fetch_prefix, const std::string &tmp_prefix) const; - + bool ContainVar(const std::weak_ptr &var) const; void Reset(); private: diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052f8ce9032df5809f5ada86e4b777..a4af3117d3e32ea8db37881bef9c4423ba0173ca 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if 
(grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = @@ -406,7 +407,7 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, place); } -static void ClearNoNeedBufferInputs(OpBase* op) { +void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; auto* ins = op->GetMutableInsMap(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9dbf4ae7799d5b249838362310fc08..bbede47e36429887b70c7a7310176c38f6d41a52 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -107,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { @@ -220,6 +225,28 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: now only used for GradVarBase */ + bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + return var_->AddVariableWrapperHook( + std::forward>(hook)); + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + return var_->RemoveVariableWrapperHook(hook_id); + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_->GetVariableWrapperHooks(); + } + + void AddVoidHook(std::shared_ptr>&& hook) { + var_->AddVoidHook( + std::forward>>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are @@ -259,5 +286,7 @@ std::shared_ptr CreateGradOpNode( const platform::Place& place, const std::map& inplace_map); +void ClearNoNeedBufferInputs(OpBase* op); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index eb0135d15e0743ef003b846a8a60a24385be7eea..9f036742f0f5dd4113a92a67980484eca2da3965 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -35,7 +35,7 @@ namespace imperative { void NCCLParallelContext::BcastNCCLId( std::vector &nccl_ids, // NOLINT - int root) { + int root, int server_fd) { if (strategy_.local_rank_ == root) { std::vector other_trainers; for (auto &ep : strategy_.trainer_endpoints_) { @@ -45,11 +45,14 @@ void NCCLParallelContext::BcastNCCLId( } platform::SendBroadCastCommID(other_trainers, &nccl_ids); } else { - platform::RecvBroadCastCommID(strategy_.current_endpoint_, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); } } void NCCLParallelContext::Init() { + int server_fd = -1; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); @@ -58,8 +61,13 @@ void NCCLParallelContext::Init() { for (size_t i = 0; i < nccl_ids.size(); ++i) { platform::dynload::ncclGetUniqueId(&nccl_ids[i]); } + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. 
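// ---------------------------------------------------------------------------
// Illustrative sketch (not from this patch): registering the two hook kinds
// exposed by the VarBase methods above. `grad_var_base` stands for some leaf
// parameter's GradVarBase and is hypothetical; error handling is omitted.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <functional>
#include <memory>

#include "paddle/fluid/imperative/hooks.h"
#include "paddle/fluid/imperative/layer.h"

void RegisterHooksSketch(
    const std::shared_ptr<paddle::imperative::VarBase>& grad_var_base) {
  // A VariableWrapperHook runs before the grad op for interior vars, or after
  // same-batch accumulation for leaf vars. Returning the input unchanged makes
  // it a no-op; a real hook would usually copy `var` and transform the copy.
  auto grad_hook =
      std::make_shared<paddle::imperative::CppVariableWrapperHook>(
          [](const std::shared_ptr<paddle::imperative::VariableWrapper>& var)
              -> std::shared_ptr<paddle::imperative::VariableWrapper> {
            return var;
          });
  int64_t hook_id =
      grad_var_base->AddVariableWrapperHook(std::move(grad_hook));

  // A void hook fires only after all gradient accumulation has finished; the
  // Reducer below registers exactly this kind of hook to trigger allreduce.
  grad_var_base->AddVoidHook(std::make_shared<std::function<void()>>(
      []() { /* e.g. mark the parameter ready for reduce */ }));

  // The wrapper hook can be dropped again via the returned id.
  grad_var_base->RemoveVariableWrapperHook(hook_id);
}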
+ server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); } - BcastNCCLId(nccl_ids, 0); + BcastNCCLId(nccl_ids, 0, server_fd); int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { @@ -79,6 +87,36 @@ void NCCLParallelContext::Init() { } } +void NCCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; + std::vector nccl_ids; + nccl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastNCCLId(nccl_ids, 0, server_fd); + + int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id + << " ring id: " << ring_id; + // it will assign nccl_comm in CUDADeviceContext within ring_id + platform::NCCLCommContext::Instance().CreateNCCLComm( + &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); + + compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); +} + void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -149,6 +187,12 @@ void NCCLParallelContext::WaitComm(int ring_id) { #endif } +void NCCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + #endif } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 51e5743aebdc3dd333f40f8ec59d1bb35f620843..1eee393aa714bb21ab77ee6668d719b93787981f 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -49,10 +49,13 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, int root); // NOLINT + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -63,6 +66,8 @@ class NCCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; + void SynchronizeCompute() override; + private: // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] std::vector> compute_events_; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cfd92fd323dfdfeced08102bf942d42..0164ff9313cdfe2344f98610602a6bd40a5e903a 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index 
ef0a960409215181ec7b7bc38e5ee55b540502b9..f537a316014d60ed18250d72de3ec2b7dd95cf05 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -50,6 +50,8 @@ class ParallelContext { virtual void Init() = 0; + virtual void InitWithRingID(int ring_id) = 0; + virtual void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; @@ -64,6 +66,9 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; + // synchorize compute stream + virtual void SynchronizeCompute() = 0; + inline int GetNRings() const { return strategy_.nrings_; } inline int64_t GetNRanks() const { return strategy_.nranks_; } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835ab17b05258fcdd7f9393584329da4..3da3a05ed1071cae20cf16ebfed6f6310937daae 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h new file mode 100644 index 0000000000000000000000000000000000000000..de5f9d75e9173a7d39c113b881e078dc43c83f39 --- /dev/null +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -0,0 +1,177 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/operators/py_layer_op.h" + +namespace paddle { +namespace imperative { + +namespace py = ::pybind11; + +bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; +} + +std::shared_ptr CreateGradOpNode( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const framework::AttributeMap& attrs, + const platform::Place& place, + const std::map& inplace_map, + const std::shared_ptr& py_context) { + operators::PyLayerGradOpMaker maker( + type, ins, outs, attrs, inplace_map); + + maker.SetPyLayerContext(py_context); + auto grad_node = maker(); + if (grad_node && !grad_node->empty()) { + for (auto& grad_op : *grad_node) { + grad_op.SetId(OpBase::GenerateUniqueId()); + grad_op.SetPlace(place); + ClearNoNeedBufferInputs(&grad_op); + } + return grad_node; + } else { + return nullptr; + } +} + +py::object PyLayerApply(const platform::Place& place, const py::handle& cls, + const py::args args, const py::kwargs kwargs) { + py::gil_scoped_acquire guard; + auto bk_function = cls.attr("_backward_function"); + auto context = bk_function(); + auto forward = cls.attr("forward"); + + auto result_forward = forward(context, *args, **kwargs); + std::shared_ptr py_layer_ctx = + std::make_shared(context.ptr()); + // make inputs to varbase + std::vector> input_vars; + // process args,`input_vars` only collect `imperative::VarBase` + if (!args.empty()) { + for (auto ptr = args.begin(); ptr != args.end(); ptr++) { + try { + if (Py_None != ptr->ptr()) { + auto a = ptr->cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error& err) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + } + } + } + // process kwargs, only collect `imperative::VarBase` + if (!kwargs.empty()) { + for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { + try { + if (Py_None != ptr->second.ptr()) { + auto a = ptr->second.cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } + } + NameVarBaseMap ins = {{"X", input_vars}}; + + std::vector> output_vars; + if (PyTuple_Check(result_forward.ptr()) || + PyList_Check(result_forward.ptr())) { + auto tuple_result = result_forward.cast(); + for (size_t i = 0; i < tuple_result.size(); i++) { + if (Py_None != tuple_result[i].ptr()) { + try { + auto temp_out = + tuple_result[i].cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } else { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } + } else { + if (Py_None != result_forward.ptr()) { + try { + auto temp_out = + result_forward.cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. 
+ // Ignore other types of input temporarily. + } + } else { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + } + } + if (output_vars.size() == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "At least one output of `PyLayer.forward` is a `Tensor`.")); + } + + NameVarBaseMap outs = {{"Out", output_vars}}; + + if (RequiredGrad(ins, outs)) { + std::map inplace_map{}; + bool if_inplace = false; + for (auto temp_ins : input_vars) { + if (if_inplace) { + break; + } + for (auto temp_outs : output_vars) { + if (temp_ins->Name() == temp_outs->Name()) { + if_inplace = true; + break; + } + } + } + if (if_inplace) { + inplace_map["X"] = "Out"; + } + + CreateGradOpNode("py_layer", ins, outs, {{}}, place, inplace_map, + py_layer_ctx); + } else { + VLOG(3) << "No Grad to track for Op: py_layer_op"; + } + + return result_forward; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35cabfce8873f0f5b4142d95582188de4..0f6676ed48f349c7aa8d66459f7c74355bf53a9b 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector> &vars, is_sparse_gradient_(is_sparse_gradient), parallel_ctx_(parallel_ctx), group_size_limits_(group_size_limits), - find_unused_vars_(find_unused_vars) { + find_unused_vars_each_step_(find_unused_vars) { VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = parallel_ctx->GetNRanks(); @@ -310,13 +310,16 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->GradVarBase()->AddVoidHook(std::make_shared>( + [=]() { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } + + // for checking var is ready once + vars_marked_ready_.resize(vars_.size(), false); + + // Initialize local used vars + local_used_vars_.resize(vars_.size(), 0); } void Reducer::InitializeDenseGroups( @@ -325,7 +328,7 @@ void Reducer::InitializeDenseGroups( for (size_t index = 0; index < variable_indices_.size(); ++index) { const auto variable_index = variable_indices_[index]; const auto &var = vars_[variable_index]; - const auto var_name = var->Name(); + const auto &var_name = var->Name(); PADDLE_ENFORCE_EQ(is_sparse_gradient_[variable_index], false, platform::errors::PreconditionNotMet( "Tensor %s's GRAD must be LoDTensor, but received " @@ -336,7 +339,7 @@ void Reducer::InitializeDenseGroups( PADDLE_ENFORCE_EQ(lod_tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( "Tensor %s is not initialized.", var_name)); - auto size = lod_tensor->numel(); + const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( size, 0, platform::errors::PreconditionNotMet( "The number of tensor %s's elements is 0.", var_name)); @@ -348,8 +351,8 @@ void Reducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(framework::Tensor()); // check the dtype and place, it must be same. 
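// ---------------------------------------------------------------------------
// Illustrative sketch (not from this patch): the finalization order that the
// Reducer's per-parameter void hook above relies on, written out for one
// hypothetical leaf GradientAccumulator. It mirrors the sequence used in
// basic_engine.cc.
// ---------------------------------------------------------------------------
#include "paddle/fluid/imperative/gradient_accumulator.h"

void FinalizeLeafGradSketch(
    paddle::imperative::GradientAccumulator* accumulator) {
  // 1. VariableWrapperHooks see only the freshly summed inner_var_
  //    (the gradient of the current batch).
  accumulator->CallGradientHooks();
  // 2. inner_var_ is folded into var_, which may still hold gradients
  //    accumulated across previous batches.
  accumulator->AccumulateGrad();
  // 3. Void hooks such as the Reducer's AddDistHook callback run last, once
  //    var_ carries the complete value that allreduce should see.
  accumulator->CallReduceHooks();
}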
- auto dtype = var->DataType(); - auto place = var->Place(); + const auto &dtype = var->DataType(); + const auto &place = var->Place(); if (index > 0) { PADDLE_ENFORCE_EQ( dtype, p_group->dtype_, @@ -419,8 +422,7 @@ void Reducer::InitializeGroups( group.variable_indices_ = std::move(variable_indices_); groups_.emplace_back(std::move(group)); // Debug Message For Reducer - VLOG(3) << "The Group[" << group_index << "]:"; - VLOG(3) << groups_.back(); + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); } } @@ -441,10 +443,6 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { auto *cur_node = q.front(); q.pop(); - for (auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); - } - const auto &grad_pending_nodes = cur_node->GradPendingNodes(); for (auto &grad_pending_node : grad_pending_nodes) { PADDLE_ENFORCE_NOT_NULL( @@ -459,38 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { } } -// After each batch is calculated, the counter of each group(group.pending_) -// and allreudce sequence counter(next_group_) will be cleaned up again. -void Reducer::PrepareForBackward( +void Reducer::TraverseBackwardGraph( const std::vector> &outputs) { - VLOG(3) << "start reseting count.."; - next_group_ = 0; - std::for_each(groups_.begin(), groups_.end(), [](Group &group) { - group.pending_ = group.variable_indices_.size(); - group.sparse_contents_ = nullptr; - }); - - PADDLE_ENFORCE_EQ( - all_group_ready_, false, - platform::errors::PreconditionNotMet( - "Please note that all forward outputs derived from the module " - "parameters must participate in the calculation of losses and " - "subsequent gradient calculations. If not, the wrapper will hang, " - "waiting for autograd to generate gradients for these parameters. " - "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph.")); - - // The first var to trigger the unused parameter - has_marked_unused_vars_ = false; - if (!find_unused_vars_) { - return; - } - - // TODO(shenliang03) "find_unused_vars" interface will be exposed in the - // future to handle control flow to process unused parameters - find_unused_vars_ = false; - - unused_vars_.clear(); node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -517,7 +485,6 @@ void Reducer::PrepareForBackward( q.pop(); for (const auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); auto &bwd_outs = cur_op.GetOutsMap(); for (const auto &pair : bwd_outs) { if (!pair.second.IsGrad()) { @@ -555,6 +522,67 @@ void Reducer::PrepareForBackward( } } +// After each batch is calculated, the counter of each group(group.pending_) +// and allreudce sequence counter(next_group_) will be cleaned up again. +void Reducer::PrepareForBackward( + const std::vector> &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](Group &group) { + group.pending_ = group.variable_indices_.size(); + group.sparse_contents_ = nullptr; + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. 
If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } + + if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } +} + // Add hook function to each leaf node. When the gradient of a leaf node is // generated, if it is the sparse parameter, it will directly execute allreduce, // if it is the dense parameter, it will execute three steps: 1, @@ -565,67 +593,141 @@ void Reducer::PrepareForBackward( // concat + allreduce + split is emitted in turn according to next_group_. // 3, FinalizeBackward: after the end, synchronize each stream. void Reducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + // rebuild group when find_unused_vars_each_step_ is false + if (NeedRebuildGroup()) { + rebuild_vars_.push_back(vars_[var_index]); + rebuild_var_indices_.push_back(var_index); + } + if (!has_marked_unused_vars_) { has_marked_unused_vars_ = true; - for (auto unused_index : unused_vars_) { - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[unused_index]); - rebuild_var_indices_.push_back(unused_index); - } + for (const auto &unused_index : unused_vars_) { MarkVarReady(unused_index, false); } } - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[var_index]); - rebuild_var_indices_.push_back(var_index); - } MarkVarReady(var_index, true); } void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { - all_group_ready_ = true; + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; - auto group_index = var_locator.group_index; + const auto group_index = var_locator.group_index; auto &group = groups_[group_index]; + // error happened, if the var is ready before. 
+ if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, vars_[var_index]->GradVarBase()->Name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + if (!group.is_sparse_) { // process dense group - auto inside_group_index = var_locator.inside_group_index; - auto length = group.length_[inside_group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto length = group.length_[inside_group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; + if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - auto tensor = - var_warpper->MutableVar()->GetMutable(); + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = var_base->MutableVar()->GetMutable(); group_tensor.ShareDataWith(*tensor).Resize( {static_cast(length)}); } else { + // TODO(shenliang03): maybe save the memory + // by avoiding tensor construction if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, group.dtype_); + } + #ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; - } + if (platform::is_xpu_place(group_tensor.place())) { + // TODO(liuyuhui) support XPU set constant + VLOG(3) << "XPU doesn't support set_constant"; + } #else - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + TensorCopy(*tensor, place_, *dev_ctx, &group_tensor); + group_tensor.Resize({static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); operators::math::set_constant(*dev_ctx, &group_tensor, 0.0); -#endif } +#endif } } else { // process sparse group - if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - group.sparse_contents_ = var_warpper->MutableVar(); - } else { - group.sparse_contents_ = nullptr; - } + PADDLE_ENFORCE_EQ( + HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] should have gradient. 
" + "Currently, DataParallel does not support sparse " + "parameters without generating gradients during training. " + "For example, if is_sparese=True is used in Embedding, " + "the current step of this parameter cannot generate gradient " + "because of stop_gradient/detatch, where error will occur.", + var_index, vars_[var_index]->Name())); + auto var_base = vars_[var_index]->GradVarBase(); + // need to check tensor type + PADDLE_ENFORCE_EQ( + var_base->Var().IsType(), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a selectedrows gradient. " + "Before forward pass, the parameter type is inferred to be " + "SelectedRows, but after backward pass, its actual type becomes " + "LodTensor. It is currently not supported by DataParallel. " + "For example, if sparse embedding is used, and the weight of " + "embedding is shared with subsequent dense parameters, then " + "the parameter gradient of the embedding will be converted " + "to dense parameters.", + var_index, vars_[var_index]->Name())); + + group.sparse_contents_ = var_base->MutableVar(); } if (--group.pending_ == 0) { @@ -641,6 +743,14 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui): If BKCL support non-blocking communication, it should be // fixed as same as multi gpus card trainging. void Reducer::MarkGroupReady(size_t group_index) { + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + if (group_index > next_group_) { VLOG(3) << "It will adjust the order of group in next batch automatically"; return; @@ -649,7 +759,7 @@ void Reducer::MarkGroupReady(size_t group_index) { for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; ++next_group_) { auto &group = groups_[next_group_]; - int run_order = next_group_ % nrings_; + const int run_order = next_group_ % nrings_; // For CUDA or XPU, compute_stream --> comm_stream. // For CPU, do nothing. @@ -665,10 +775,11 @@ void Reducer::MarkGroupReady(size_t group_index) { // TODO(liuyuhui): Add try catch to deal with exception later, // otherwise the main thread will continue to run when an exception is // thrown in comm_pool_. 
- comm_pool_->enqueue([&] { + auto next_group = next_group_; + comm_pool_->enqueue([this, run_order, next_group, &group] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock @@ -676,7 +787,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with BKCL or NCCL.")); @@ -684,24 +795,23 @@ void Reducer::MarkGroupReady(size_t group_index) { } } -void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { +void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + // dev_context is used to select different stream + const auto &dev_context = *parallel_ctx_->GetDeviceContext(run_order); if (group.is_sparse_) { - if (group.sparse_contents_ != nullptr) { - VLOG(3) << "sparse group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); - parallel_ctx_->AllReduceByStream( - *group.sparse_contents_, group.sparse_contents_, run_order, false); - } else { - VLOG(3) << "The sparse group[" << next_group_ - << "] has no var to allreduce"; - } + VLOG(3) << "sparse group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; + group.DivNRanks(dev_context, nranks_); + parallel_ctx_->AllReduceByStream(*group.sparse_contents_, + group.sparse_contents_, run_order, false); } else { - VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; + VLOG(3) << "dense group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; // Select common commstream to concat tensors // group.dense_tensors ---> group.dense_contents_ - group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.ConcatTensors(dev_context); // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support // default stream for communicating, so there exist some problems in @@ -713,15 +823,15 @@ void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { parallel_ctx_->WaitComm(run_order); } #endif - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); + group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( group.dense_contents_, &(group.dense_contents_), run_order, false); - // Select common commstream to split tensors + // Select communication stream to split tensors // group.dense_contents_ ---> group.dense_tensors - group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.SplitTensors(dev_context); } } @@ -747,14 +857,98 @@ std::vector> Reducer::RebuildGruops() { return rebuild_group_indices; } +void Reducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. 
+ VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + // H2D is to allreduce the local_used_vars_ + auto *global_used_tensor = + global_used_vars_.GetMutable(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + parallel_ctx_->AllReduceByStream(global_used_vars_, &global_used_vars_, 0, + true); + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + parallel_ctx_->SynchronizeCompute(); + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "Var [" << var_index << "] [" << vars_[var_index]->Name() + << "] global_unused:" << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Start process unused Var"; + // 1. source var base + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto &src_tensor = group.dense_tensors_[inside_group_index]; + // sparse no need to check and no support find_unused_parameters + if (group.is_sparse_) { + continue; + } + // 2. destination var base + auto dest_var_base = vars_[var_index]; + auto *dest_tensor = + dest_var_base->MutableVar()->GetMutable(); + const auto &dest_dims = dest_tensor->dims(); + + // 3. create grad var base or get grad var base + auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + + // 4. 
set grad tensor + auto *dest_grad_tensor = + grad_var_base_tmp->MutableVar()->GetMutable(); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor); + dest_grad_tensor->Resize(dest_dims); + } + } +} + +bool Reducer::HasGrad(size_t var_index) { + const auto grad_var = vars_[var_index]->GradVarBase(); + if (!grad_var || !grad_var->Var().IsInitialized()) { + return false; + } + + const auto &var = grad_var->Var(); + if (var.IsType()) { + if (var.Get().IsInitialized()) { + return true; + } + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { + return true; + } + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + return false; +} + void Reducer::FinalizeBackward() { - all_group_ready_ = false; + groups_need_finalize_ = false; #ifdef PADDLE_WITH_XPU_BKCL { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return comm_op_count_ == 0; }); } #endif + // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { parallel_ctx_->WaitComm(i); @@ -767,7 +961,18 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - VLOG(3) << "In the batch, Reducer is finished..."; + if (find_unused_vars_each_step_) { +// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ProcessUnusedDenseVars(); +#endif + // Initialize local used vars + local_used_vars_.clear(); + local_used_vars_.resize(vars_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; } // According to the size of each parameter, it is allocated to different groups. diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b2680d0dea71aa19399242bffedc3d7914cebbb9..8392ab2c704d503a622cc09cd5a7efb8ebc680b3 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" @@ -153,13 +154,23 @@ class Reducer { void MarkGroupReady(size_t group_index); - void FusedAllReduceSchedule(int run_order, Group& group); // NOLINT + void FusedAllReduceSchedule(const int run_order, Group& group, // NOLINT + const int curr_group_index); void FinalizeBackward(); std::vector> RebuildGruops(); - inline bool NeedRebuildGroup() { return !has_rebuilt_group_; } + inline bool NeedRebuildGroup() { + return !has_rebuilt_group_ && !find_unused_vars_each_step_; + } + + void ProcessUnusedDenseVars(); + + bool HasGrad(size_t var_index); + + void TraverseBackwardGraph( + const std::vector>& outputs); private: std::vector> vars_; @@ -187,8 +198,9 @@ class Reducer { std::unordered_map var_index_map_; std::vector unused_vars_; bool has_marked_unused_vars_{false}; - bool find_unused_vars_{false}; - bool all_group_ready_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; + bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. 
std::unique_ptr<::ThreadPool> comm_pool_{nullptr}; @@ -196,6 +208,19 @@ class Reducer { std::mutex mutex_; std::condition_variable cv_; #endif + + // it just for checking hook, each parameter can only trigger one hook + std::vector vars_marked_ready_; + + // Following variables are to help control flow. + // local_used_vars_ uses 0/1 to indicate whether the + // var is used in iteration. After the end of the + // iteration, global_used_vars_ is obtained synchronously + // globally. Choose whether to update the local + // gradient according to the global_used_vars_. + std::vector local_used_vars_; + // global_used_vars_ is used in comm stream to avoid wait + framework::Variable global_used_vars_; }; std::vector> AssignGroupBySize( diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 4967df5341d3559aa9a8d6c57e8d12ba808396e0..2d8a08217b0b83cfc22c250551e9aa81e01e86c0 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -15,6 +15,7 @@ #include // NOLINT #include "paddle/fluid/imperative/nccl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "gtest/gtest.h" @@ -36,9 +37,13 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void BcastNCCLId(int local_rank, std::vector* nccl_ids) { auto strategy = GetStrategy(local_rank); + int server_fd = platform::CreateListenSocket(strategy.current_endpoint_); + platform::CUDAPlace gpu(local_rank); imperative::NCCLParallelContext ctx(strategy, gpu); - ctx.BcastNCCLId(*nccl_ids, 0); + ctx.BcastNCCLId(*nccl_ids, 0, server_fd); + + platform::CloseSocket(server_fd); } TEST(BcastNCCLId, Run) { diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681bab001665080f506b087455dfde4b..5c4e1538cf053853d2e9d5dab88419d930b06b63 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -37,6 +37,30 @@ namespace imperative { using vb_vector = std::vector>; using var_pair = std::pair; +std::shared_ptr DoubleHook( + const std::shared_ptr& var) { + // 1. create out var + auto out_var = std::make_shared(var->Name()); + out_var->SetType(var->Type()); + out_var->SetDataType(var->DataType()); + out_var->SetForwardDataType(var->ForwardDataType()); + out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient()); + + // 2. get input and output var's tensor + auto* out_tensor = out_var->MutableVar()->GetMutable(); + auto& tensor = var->Var().Get(); + out_tensor->Resize(tensor.dims()); + + // 3. double calc + auto* data = tensor.data(); + auto* out_data = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); ++i) { + out_data[i] = data[i] * 2.0; + } + + return out_var; +} + TEST(TestHooks, TestGradVarLeafBackwardHook) { // 1. 
prepare Tracer tracer; @@ -73,17 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 10; })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -93,16 +114,21 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 8.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 10); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, @@ -151,17 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 100; })); // 2. forward var_pair x_pair = var_pair("X", vb_vector(1, x)); @@ -193,16 +216,21 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 16.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 100); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6838477cac579df164bfddf31a176c..76de413b3e6033bf1e6027bbd3bbc210d8a405df 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index c9567a98a73bf213c70bbfbd2c9a39b5efb38d2e..0c2fd2da7e0ba6ed13ad993f30ed8650e6fc15a2 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -38,7 +39,7 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { VLOG(6) << "Set current tracer: " << g_current_tracer; } -static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { for (const auto& pair : outs) { for (const auto& var : pair.second) { // NOTE(zhiqiu): this happends when None output are passed from python @@ -83,7 +84,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), 0)); @@ -94,7 +95,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::CUDAPinnedGarbageCollector( BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); @@ -134,6 +135,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map) { 
platform::RecordEvent op_type_record_event(type); + platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b10d1b2d0b49dac42f80e9462fb35ee58312b8b0..8f50550878262f1d37c34923e4c8bc55460b08d6 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -130,5 +130,7 @@ void IncreaseVarbaseReferenceCountUntilCopyComplete( const std::shared_ptr& var, const platform::Place& place); +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index b42f25dcc880015b9e191970f84e970f329bb0ee..5fa8b89a396d9bd509de375471b8d383b9b91874 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -27,8 +27,8 @@ namespace paddle { namespace imperative { -class InteriorVarHookPipeline; -class LeafVarHookPipeline; +class VariableWrapperHook; +class InplaceVariableWrapperHook; class VarBase; class GradOpNode; @@ -38,6 +38,9 @@ class VariableWrapper { explicit VariableWrapper(const std::string& name) : name_(name) {} + VariableWrapper(const std::string& name, const framework::Variable& variable) + : var_(variable), name_(name) {} + ~VariableWrapper() { VLOG(10) << "Destruct VariableWrapper: " << Name(); } const framework::Variable& Var() const { return var_; } @@ -193,42 +196,6 @@ class VariableWrapper { } } - /* Hook related method: only can be call by GradVarBase */ - - bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } - - bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } - - void AddGradVarInteriorHook(std::unique_ptr&& hook) { - auto interior_hooks = GetGradVarInteriorHooksSafely(); - interior_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafHook(std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafBackwardHook( - std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_backward_hook(std::move(hook)); - } - - const std::shared_ptr& GetInteriorHooks() const { - return interior_hooks_; - } - - std::shared_ptr& GetInteriorHooks() { - return interior_hooks_; - } - - const std::shared_ptr& GetLeafHooks() const { - return leaf_hooks_; - } - - std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } - uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } void ResetInplaceVersion() { @@ -255,6 +222,38 @@ class VariableWrapper { return; } + /* Hook related methods */ + bool HasVariableWrapperHook() const { return !var_hooks_.empty(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + var_hooks_.emplace(next_hook_id_, std::move(hook)); + return next_hook_id_++; + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + auto remove_cnt = var_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_hooks_; + } + + bool HasVoidHook() const { return !void_hooks_.empty(); } + + void AddVoidHook(std::shared_ptr>&& hook) { + void_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>>& GetVoidHooks() + const { + return void_hooks_; + } + private: void SetGradVar(const std::shared_ptr& var) 
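// A usage sketch for the hook API above (a VariableWrapper `wrapper` and a
// hook type MyHook derived from VariableWrapperHook are assumed here for
// illustration only):
//   int64_t id = wrapper.AddVariableWrapperHook(std::make_shared<MyHook>());
//   wrapper.RemoveVariableWrapperHook(id);  // returns false if id is unknown
//   wrapper.AddVoidHook(
//       std::make_shared<std::function<void()>>([]() { /* after backward */ }));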
{ auto shared_var = grad_var_.lock(); @@ -289,41 +288,6 @@ class VariableWrapper { } } - /* Hook related private methods */ - std::shared_ptr GetGradVarSafely() const { - auto shared_grad_var = grad_var_.lock(); - PADDLE_ENFORCE_NOT_NULL( - shared_grad_var, - platform::errors::PermissionDenied( - "Cannot add gradient hook on Tensor without gradient.")); - return shared_grad_var; - } - - std::shared_ptr& GetGradVarInteriorHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ(HasGradNode(), true, - platform::errors::PermissionDenied( - "Only interior Tensor in backward can register " - "interior gradient hook.")); - if (shared_grad_var->interior_hooks_ == nullptr) { - shared_grad_var->interior_hooks_ = - std::make_shared(); - } - return shared_grad_var->interior_hooks_; - } - - std::shared_ptr& GetGradVarLeafHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ( - HasGradNode(), false, - platform::errors::PermissionDenied( - "Only leaf Tensor in backward can register leaf gradient hook.")); - if (shared_grad_var->leaf_hooks_ == nullptr) { - shared_grad_var->leaf_hooks_ = std::make_shared(); - } - return shared_grad_var->leaf_hooks_; - } - private: framework::Variable var_; std::string name_; @@ -358,11 +322,19 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE: only grad var can hold hooks now - // only interior var can hold interior hooks - std::shared_ptr interior_hooks_; - // only leaf var can hold leaf hooks - std::shared_ptr leaf_hooks_; + // NOTE(chenweihang): only grad var will hold hooks now + int64_t next_hook_id_{0}; + // [ Hooks with VariableWrapper as input and output ] + // NOTE: Now registered for grad var, support adding and removing, + // key is the accumulated int64_t value + // NOTE: Var hook need to support removing, so need hook id + std::map> var_hooks_; + // [ Hooks without input and output ] + // NOTE: Now registered after the execution of the entire backward + // process is over, currently only used for reducing in distributed + // training + // NOTE: Now no need to support remove void hook + std::vector>> void_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7a8bfc1a8c7008f5b16a2fca6692600b39690e59..c002c7a10cb7b3e953a4e2551e54b20998dea400 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,9 +33,13 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) + add_subdirectory(api) # Create static inference library if needed @@ -57,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index d5a972fab3beae4d4e2e512d1ccda3f0b8356682..14a1c3eea3417432c76ce03d41b558577d2aa037 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f34475db793d643f2d12508e0aea631e..255c6ca75dfd74b3cf5984661ea931d36295f72a 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 368ef2e5583fe2f6fcb24c98ded02f4e5325f7a4..ede0402f816765f7079eab91170e2d9d5905e915 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index ab4949935140c682c7dee355a0e4ed9c2b3a1f5a..cace420d87c9df54387c27cecc58705c19ce5336 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -25,7 +25,6 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c4021b049093c49ddaecb056284f..4bb08dc96b1cf529c1b433092f3b9e51d03aa7e9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -106,8 +106,8 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); std::string optim_cache_dir = argument->optim_cache_dir(); - bool int8_valid = - !(model_from_memory && optim_cache_dir.empty() && enable_int8); + bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && + enable_int8 && use_calib_mode); PADDLE_ENFORCE_EQ( int8_valid, true, platform::errors::PreconditionNotMet( @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7bae7f5795e2a4652b808956f6776..330f7a99847344f7359a29e26efac71e969bf06d 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h similarity index 64% rename from paddle/fluid/operators/distributed/large_scale_kv.cc rename to paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h index 
d2673ed6ffb3667eed2a4599ae462587c18431b0..ae977c1403a8793b0611496702515f1df952d5a1 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,16 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - -#include "paddle/fluid/operators/distributed/large_scale_kv.h" +#pragma once namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); +namespace inference { -} // namespace distributed -} // namespace operators +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f789139af9bfc35841f284d043a2c86f5803e93 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
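// A usage sketch for the registration hook declared in dlnne_reg_py.h above
// (MyConvertGraph is a hypothetical callback, shown for illustration only):
//   int MyConvertGraph(const char *graph_name) { /* build DLNNE engine */ return 0; }
//   paddle::inference::RegisterPyFunc("convert_graph",
//                                     reinterpret_cast<void *>(&MyConvertGraph));
// ConvertGraph(engine_key) in the pass below then forwards to this callback.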
+#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and should not have another copy in + // fluid. 
+ std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; 
+ std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != 
nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..5a1d2506fdb09b5ecb63f8f922490eb4c8c01e2d --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e949050d6e61a7edb521ad6dd0e5..b8cac8992f4eed36b653b08febe48630c3977652 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) { @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 8a14e168ca4f7ebce3f7fb619655121da38c7581..f57f07883dcd701470457a3e32e14b1ae0493ea3 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -86,7 +87,7 @@ std::string GenerateEngineKey(const std::set &engine_inputs, const std::string &predictor_id, const std::string &max_batch_size, const std::string &precision, - const std::string &use_calib_mode) { + const bool for_calibration) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -97,12 +98,13 @@ std::string GenerateEngineKey(const std::set &engine_inputs, engine_hash_key += "#"; } engine_hash_key += predictor_id; - engine_hash_key 
+= "#"; - engine_hash_key += max_batch_size; + if (!for_calibration) { + engine_hash_key += "#"; + engine_hash_key += max_batch_size; + } engine_hash_key += "#"; engine_hash_key += precision; - engine_hash_key += "#"; - engine_hash_key += use_calib_mode; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); VLOG(2) << "TRT engine hash key: " << engine_hash_key; VLOG(2) << "TRT engine key: " << engine_key; @@ -167,11 +169,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::set output_names; std::set output_names_with_id; - std::vector origin_output_dims; + std::map origin_name_output_dims; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); - origin_output_dims.push_back(x->Var()->GetShape().size()); + origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); } std::unordered_map output_name_map; @@ -215,11 +217,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // output_mapping help us copy the data from the renamed ITensor // to Tensor. std::vector output_mapping; + std::vector renamed_output_dims; for (auto name : output_names) { PADDLE_ENFORCE_NE(output_name_map.count(name), 0, platform::errors::PreconditionNotMet( "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); + renamed_output_dims.push_back(origin_name_output_dims[name]); } PADDLE_ENFORCE_EQ(output_mapping.empty(), false, platform::errors::PreconditionNotMet( @@ -242,7 +246,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("workspace_size", Get("workspace_size")); op_desc->SetAttr("gpu_id", Get("gpu_device_id")); op_desc->SetAttr("output_name_mapping", output_mapping); - op_desc->SetAttr("origin_output_dims", origin_output_dims); + op_desc->SetAttr("origin_output_dims", renamed_output_dims); op_desc->SetAttr("parameters", params); // we record all inputs' shapes in attr to check if they are consistent @@ -258,24 +262,31 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // TODO(NHZlX) // There are models with the same structure but the different parameters, // when running in the 'use_serialize' mode, there is a bug. + // serialization is affected by max_batch_size, but calibration is not. + // So we use seperate engine keys in serialization and calibration. auto engine_key = GenerateEngineKey( input_names_with_id, output_names_with_id, std::to_string(0), std::to_string(Get("max_batch_size")), - std::to_string(static_cast(precision_mode)), - std::to_string(static_cast(use_calib_mode))); + std::to_string(static_cast(precision_mode)), false); + auto calibration_engine_key = GenerateEngineKey( + input_names_with_id, output_names_with_id, std::to_string(0), + std::to_string(Get("max_batch_size")), + std::to_string(static_cast(precision_mode)), true); auto predictor_id = Get("predictor_id"); // Get "" when there is no cached calibration table data. 
std::string calibration_data = ""; if (enable_int8 && use_calib_mode) { - calibration_data = GetTrtCalibTableData( - Get("model_opt_cache_dir"), engine_key, enable_int8); + calibration_data = + GetTrtCalibTableData(Get("model_opt_cache_dir"), + calibration_engine_key, enable_int8); } op_desc->SetAttr("calibration_data", calibration_data); op_desc->SetAttr("enable_int8", enable_int8); op_desc->SetAttr("enable_fp16", enable_fp16); op_desc->SetAttr("use_calib_mode", use_calib_mode); op_desc->SetAttr("engine_key", engine_key); + op_desc->SetAttr("calibration_engine_key", calibration_engine_key); op_desc->SetAttr("predictor_id", predictor_id); std::string trt_engine_serialized_data = ""; @@ -311,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 5e6960c4c7e8c052fb9572f0fd6bcba24a7713b4..fdfd2c60af0c16404953e8639385e539dc13c9b3 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -103,6 +103,7 @@ void MemoryOptimizePass::CollectVarMemorySize( "merge_lod_tensor", "equal", "sequence_pool", + "recurrent", "lod_reset"}; for (auto* tmp : node->inputs) { CHECK(tmp->IsOp()); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9a4637306bb359a71784b9affbc0004d36d95c05..82c95ba2c95712d2ebe3aa80286689028febf3fe 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -32,10 +32,10 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) if(WITH_CRYPTO) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto) + analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator) else() cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto) + analysis_config zero_copy_tensor trainer_desc_proto custom_operator) endif() if(WIN32) @@ -57,11 +57,9 @@ if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES 
DEPENDS test_image_classification) elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38c87a98fcb86da64bdb21570e67d..853c1ac1da8742733e609c1dea098a208eadc015 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. 
+ std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } } void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2a1dacedca8f1b81d155841561bd5d5a16ca9344..89c8c7902bac9fd2e15a164f7e0dfd21945cf16e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -191,22 +191,8 @@ bool AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(); - scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { - delete scope; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount(); - ++dev_id) { - memory::Release(platform::CUDAPlace(dev_id)); - } -#endif -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount(); - ++dev_id) { - memory::Release(platform::XPUPlace(dev_id)); - } -#endif - memory::Release(platform::CPUPlace()); - }); + // TODO(wilber): we need to release memory occupied by weights. 
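// A usage sketch for the AnalysisConfig options extended above (all values
// are illustrative, not defaults taken from this patch):
//   paddle::AnalysisConfig config;
//   config.EnableXpu(/*l3_workspace_size=*/16 << 20, /*locked=*/false,
//                    /*autotune=*/true, /*autotune_file=*/"",
//                    /*precision=*/"int16", /*adaptive_seqlen=*/false);
//   config.EnableDlnne(/*min_subgraph_size=*/3);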
+ scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); @@ -537,6 +523,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +538,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } @@ -617,7 +614,7 @@ std::unique_ptr CreatePaddlePredictor< // This function can only be executed once per process. static std::once_flag custom_operators_registered; std::call_once(custom_operators_registered, - []() { paddle::RegisterAllCustomOperator(); }); + []() { inference::RegisterAllCustomOperator(); }); if (config.use_gpu()) { static std::once_flag gflags_initialized; @@ -1017,8 +1014,8 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { auto &block = inference_program_->Block(0); for (auto &op_desc : block.AllOps()) { if (op_desc->Type() == "tensorrt_engine") { - std::string engine_name = - BOOST_GET_CONST(std::string, op_desc->GetAttr("engine_key")); + std::string engine_name = BOOST_GET_CONST( + std::string, op_desc->GetAttr("calibration_engine_key")); if (!Singleton::Global().Has(engine_name)) { LOG(ERROR) << "You should run the predictor(with trt) on the real data " "to generate calibration info"; @@ -1191,6 +1188,13 @@ USE_TRT_CONVERTER(slice); USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); +USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(anchor_generator); +USE_TRT_CONVERTER(yolo_box); +USE_TRT_CONVERTER(roi_align); +USE_TRT_CONVERTER(affine_channel); +USE_TRT_CONVERTER(multiclass_nms); +USE_TRT_CONVERTER(nearest_interp); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 0d9f3d2aa237acaf3bd7adb031b1f2a73c555352..c265721db577527170b1c1c1e4ac8df28de1485d 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ set -x cd `dirname $0` rm -rf build/ data/ diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index e11a5b9c3372a746a9ea57811fa07ee8b5c96018..53f925966662667571ef39a5d51dc4536479c295 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -88,7 +88,7 @@ for WITH_STATIC_LIB in ON OFF; do return 0 fi # -----simple_on_word2vec on windows----- - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ @@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do # -----vis_demo on windows----- rm -rf * - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat index 523dafa6649b9faa019edc1c1926b5fa408e03d5..d17f516fcca5e8fd3e254e31d3f30a09a717cc5b 100644 --- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat +++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat @@ -67,7 +67,7 @@ if /i "%use_gpu%"=="Y" ( rem set_path_vs_command_prompt :set_vcvarsall_dir -SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat =======>" set tmp_var=!vcvarsall_dir! call:remove_space set vcvarsall_dir=!tmp_var! @@ -177,16 +177,16 @@ if /i "%use_mkl%"=="N" ( if /i "%gpu_inference%"=="Y" ( if "%demo_name%"=="trt_mobilenet_demo" ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=ON ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" -DUSE_TENSORRT=ON ) else ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=ON ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" ) ) else ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=OFF ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=OFF ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON ) diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md index 73938cb995f17a0ccd8df7effce18c4f81c03916..c646c351462d460d2a3f8c236cf4f870f1d4ad30 100644 --- a/paddle/fluid/inference/api/demo_ci/windows_inference.md +++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md @@ -8,7 +8,7 @@ 3. 
Go to the Paddle/paddle/fluid/inference/api/demo_ci directory, create a build directory, and then use cmake to generate the Visual Studio 2017 solution file.
 Here PADDLE_LIB is the folder that contains the paddle_inference.lib mentioned above, and CUDA_LIB is the x64 CUDA system library directory.
 ```shell
-  cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
+  cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64
 ```
 Then open the generated project file with Visual Studio 2017, make sure static linking ("/MT") is used, and build the exe. Put openblas.dll in the same directory as the exe.
diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc
index 9cc491e10d691a206dd903b78c0ea570741da44c..d78560239de50eb224641583d62b55bac75be465 100644
--- a/paddle/fluid/inference/api/helper.cc
+++ b/paddle/fluid/inference/api/helper.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/extension/include/ext_op_meta_info.h"
+#include "paddle/fluid/framework/custom_operator.h"
+#include "paddle/fluid/framework/operator.h"
 
 namespace paddle {
 namespace inference {
@@ -40,5 +43,20 @@ std::string to_string<std::vector<std::vector<float>>>(
   return ss.str();
 }
 
+void RegisterAllCustomOperator() {
+  auto &op_meta_info_map = OpMetaInfoMap::Instance();
+  const auto &meta_info_map = op_meta_info_map.GetMap();
+  for (auto &pair : meta_info_map) {
+    const auto &all_op_kernels{framework::OperatorWithKernel::AllOpKernels()};
+    if (all_op_kernels.find(pair.first) == all_op_kernels.end()) {
+      framework::RegisterOperatorWithMetaInfo(pair.second);
+    } else {
+      LOG(INFO) << "The operator `" << pair.first
+                << "` has been registered. 
" + "Therefore, we will not repeat the registration here."; + } + } +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 14b968f5834da8618f6af16aa8c25e1d1baaae5e..c6d25137594b76a1ff67d9fb25b2480372c3eefa 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -398,5 +398,7 @@ static bool IsFileExists(const std::string &path) { return exists; } +void RegisterAllCustomOperator(); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 793fc53d90b768050572a3dd0a080a5d30e959a2..f6cdbb00b50453d4c4ff7fc06ba82aa042dd194a 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -411,7 +411,8 @@ void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap( + paddle::platform::MKLDNNDeviceContext::tls().get_curr_exec()); } void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbefcc121b616450170e5cc22bb913..2bbd4bb837a22f672e5aa625f299424b6f0c5b88 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. /// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_; + bool xpu_autotune_; + std::string xpu_autotune_file_; + std::string xpu_precision_; + bool xpu_adaptive_seqlen_; // mkldnn related. 
 int mkldnn_cache_capacity_{0};
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 61fcdb7a90830da06bd41de0d97b8413d5f1f0ff..b2e3de63691c555b24eb6f1e1fb9ffcc35d400f9 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -86,6 +86,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "simplify_with_basic_ops_pass",           //
       "embedding_eltwise_layernorm_fuse_pass",  //
       "multihead_matmul_fuse_pass_v2",          //
+      "multihead_matmul_fuse_pass_v3",          //
       "skip_layernorm_fuse_pass",               //
       "conv_bn_fuse_pass",                      //
       "unsqueeze2_eltwise_fuse_pass",           //
@@ -109,6 +110,16 @@ const std::vector<std::string> kTRTSubgraphPasses({
       "transpose_flatten_concat_fuse_pass",
 });
 
+const std::vector<std::string> kDlnneSubgraphPasses({
+    "is_test_pass",                  //
+    "delete_dropout_op_pass",        //
+    "simplify_with_basic_ops_pass",  //
+    "conv_bn_fuse_pass",             //
+    "depthwise_conv_bn_fuse_pass",   //
+    "shuffle_channel_detect_pass",   //
+    "dlnne_subgraph_pass",           //
+});
+
 const std::vector<std::string> kLiteSubgraphPasses({
 #ifdef PADDLE_WITH_LITE
     "lite_subgraph_pass",
@@ -235,8 +246,8 @@ void CpuPassStrategy::EnableMKLDNN() {
       "reshape_transpose_matmul_mkldnn_fuse_pass",  //
       "matmul_transpose_reshape_fuse_pass",         //
       // Disabled due to topology-dependent speed-up
-      //"fc_mkldnn_pass",
-      //"fc_act_mkldnn_fuse_pass",
+      // "fc_mkldnn_pass",
+      // "fc_act_mkldnn_fuse_pass",
       "batch_norm_act_fuse_pass",
       // TODO(intel): Please fix the bug on windows.
       // https://github.com/PaddlePaddle/Paddle/issues/29710
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h
index a725ebab35eadaaaab76a3a7c4580f95b64d827d..d7556b50031b7d63b75e1e0d12fa173f8fe9fd33 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.h
+++ b/paddle/fluid/inference/api/paddle_pass_builder.h
@@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy {
 /// \brief List of tensorRT subgraph passes.
 PD_INFER_DECL extern const std::vector<std::string> kTRTSubgraphPasses;
 
+/// \brief List of dlnne subgraph passes.
+PD_INFER_DECL extern const std::vector<std::string> kDlnneSubgraphPasses;
+
 /// \brief List of lite subgraph passes.
PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d8646faa94ca453227de923c5814..9bb52ba57802512f393c23f957cc38ddabb878b1 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index c1bf4c974fac8c80c3e8e31fbd247332a325e2aa..c4e195b6ec8fabb3b831b37fd9b46b3d81a92371 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ b/paddle/fluid/inference/capi/pd_predictor.cc @@ -207,13 +207,16 @@ int PD_GetOutputNum(const PD_Predictor* predictor) { } const char* PD_GetInputName(const PD_Predictor* predictor, int n) { - static std::vector names = predictor->predictor->GetInputNames(); + static std::vector names; + names.resize(predictor->predictor->GetInputNames().size()); + names[n] = predictor->predictor->GetInputNames()[n]; return names[n].c_str(); } const char* PD_GetOutputName(const PD_Predictor* predictor, int n) { - static std::vector names = - predictor->predictor->GetOutputNames(); + static std::vector names; + names.resize(predictor->predictor->GetOutputNames().size()); + names[n] = predictor->predictor->GetOutputNames()[n]; return names[n].c_str(); } diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..521d24329d46411a8674ebe783b77f5a585a6551 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
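+#
+# Build the experimental C API from the sources below: a static
+# paddle_inference_c library is always created, and a shared
+# paddle_inference_c library is additionally produced for ON_INFER builds.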
+# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b049e992e71dd64c6616b2ac5c951ee10ea7909 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 
11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 0000000000000000000000000000000000000000..4b70ed7fbad297efdf1863317e3af2b69bed702b --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. 
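+///
+/// For example (illustrative, based on the declarations in this directory):
+/// PD_PredictorCreate() returns a __pd_give PD_Predictor* that is eventually
+/// handed to the __pd_take parameter of PD_PredictorDestroy(), while
+/// PD_PredictorRun() only borrows the predictor through a __pd_keep pointer.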
+/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 0000000000000000000000000000000000000000..c45454e86bdaac5e8f054da91410eab7e2b873a2 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + 
CHECK_NULL_POINTER_PARM(opt_cache_dir); + config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_engine_enabled(); +} + +void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config, + size_t tensor_num, + const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, + int32_t** 
max_shape, int32_t** optim_shape, + PD_Bool disable_trt_plugin_fp16) { + CHECK_AND_CONVERT_PD_CONFIG; + std::map> min_input_shapes; + std::map> max_input_shapes; + std::map> optim_input_shapes; + for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) { + std::string name(tensor_name[tensor_index]); + std::vector min_input_shape, max_input_shape, optim_input_shape; + for (size_t shape_index = 0; shape_index < shapes_num[tensor_index]; + ++shape_index) { + min_input_shape.emplace_back(min_shape[tensor_index][shape_index]); + max_input_shape.emplace_back(max_shape[tensor_index][shape_index]); + optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]); + } + min_input_shapes[name] = std::move(min_input_shape); + max_input_shapes[name] = std::move(max_input_shape); + optim_input_shapes[name] = std::move(optim_input_shape); + } + config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes, + optim_input_shapes, disable_trt_plugin_fp16); +} + +void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** ops_name) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector ops_list; + for (size_t index = 0; index < ops_num; ++index) { + ops_list.emplace_back(ops_name[index]); + } + config->Exp_DisableTensorRtOPs(ops_list); +} + +void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtOSS(); +} +PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_oss_enabled(); +} + +void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config, + int32_t dla_core) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtDLA(dla_core); +} +PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_dla_enabled(); +} + +void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config, + PD_PrecisionType precision, PD_Bool zero_copy, + size_t passes_filter_num, + const char** passes_filter, + size_t ops_filter_num, const char** ops_filter) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector passes_filters, ops_filters; + for (size_t index = 0; index < passes_filter_num; ++index) { + passes_filters.emplace_back(passes_filter[index]); + } + for (size_t index = 0; index < ops_filter_num; ++index) { + ops_filters.emplace_back(ops_filter[index]); + } + config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy, + passes_filters, ops_filters); +} +PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->lite_engine_enabled(); +} + +void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrDebug(x); +} +void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMKLDNN(); +} +void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config, + int32_t capacity) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetMkldnnCacheCapacity(capacity); +} +PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_enabled(); +} +void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads); +} +int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) { + 
CHECK_AND_CONVERT_PD_CONFIG; + return config->cpu_math_library_num_threads(); +} + +void PD_ConfigSetMkldnnOp(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetMKLDNNOp(std::move(op_names)); +} +void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnQuantizer(); +} +void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMkldnnBfloat16(); +} +PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_bfloat16_enabled(); +} +void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num, + const char** op_list) { + CHECK_AND_CONVERT_PD_CONFIG; + std::unordered_set op_names; + for (size_t index = 0; index < ops_num; ++index) { + op_names.emplace(op_list[index]); + } + config->SetBfloat16Op(std::move(op_names)); +} +PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->thread_local_stream_enabled(); +} +PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->mkldnn_quantizer_enabled(); +} +void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config, + const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, + size_t params_buffer_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer, + params_buffer_size); +} +PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_from_memory(); +} +void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableMemoryOptim(); +} +PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->enable_memory_optim(); +} +void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableProfile(); +} +PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->profile_enabled(); +} +void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGlogInfo(); +} +PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->glog_info_disabled(); +} +void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetInValid(); +} +PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->is_valid(); +} +void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableGpuMultiStream(); +} +void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->PartiallyRelease(); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h new file mode 100644 index 0000000000000000000000000000000000000000..e44983e24484eae930afa6b84db397ac3aad8f08 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -0,0 +1,571 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. +/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. +/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the GPU device id. +/// +/// \param[in] pd_onfig config +/// \return The GPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the XPU device id. +/// +/// \param[in] pd_onfig config +/// \return The XPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \param[in] pd_onfig config +/// \return The initial size in MB of the GPU memory pool. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \param[in] pd_onfig config +/// \return The proportion of the initial memory pool size. +/// +PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on CUDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use CUDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use CUDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. +/// +/// \param[in] pd_onfig config +/// \param[in] x Whether the ir graph optimization is actived. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \param[in] pd_onfig config +/// \return Whether to use ir graph optimization. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the TensorRT engine. +/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// computation graph. In some models such as resnet50, GoogleNet and so on, +/// it gains significant performance acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] workspace_size The memory size(in byte) used for TensorRT +/// workspace. +/// \param[in] max_batch_size The maximum batch size of this prediction task, +/// better set as small as possible for less performance loss. +/// \param[in] min_subgrpah_size The minimum TensorRT subgraph size needed, if a +/// subgraph is smaller than this, it will not be transferred to TensorRT +/// engine. +/// \param[in] precision The precision used in TensorRT. +/// \param[in] use_static Serialize optimization information to disk for +/// reusing. +/// \param[in] use_calib_mode Use TRT int8 calibration(post training +/// quantization). +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine( + __pd_keep PD_Config* pd_config, int32_t workspace_size, + int32_t max_batch_size, int32_t min_subgraph_size, + PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode); +/// +/// \brief A boolean state telling whether the TensorRT engine is used. +/// +/// \param[in] pd_onfig config +/// \return Whether the TensorRT engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. +/// +/// \param[in] pd_onfig config +/// \param[in] tensor_num The number of the subgraph input. +/// \param[in] tensor_name The name of every subgraph input. +/// \param[in] shapes_num The shape size of every subgraph input. +/// \param[in] min_shape The min input shape of every subgraph input. +/// \param[in] max_shape The max input shape of every subgraph input. +/// \param[in] optim_shape The opt input shape of every subgraph input. +/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that +/// TRT plugin will not run fp16. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo( + __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, int32_t** max_shape, + int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16); +/// +/// \brief Prevent ops running in Paddle-TRT +/// NOTE: just experimental, not an official stable API, easy to be broken. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num ops number +/// \param[in] ops_name ops name +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name); +/// +/// \brief Replace some TensorRT plugins to TensorRT OSS( +/// https://github.com/NVIDIA/TensorRT), with which some models's inference +/// may be more high-performance. Libnvinfer_plugin.so greater than +/// V7.2.1 is needed. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the TensorRT OSS. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT OSS. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Enable TensorRT DLA +/// +/// \param[in] pd_onfig config +/// \param[in] dla_core ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla( + __pd_keep PD_Config* pd_config, int32_t dla_core); +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT DLA. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param[in] pd_onfig config +/// \param[in] precision Precion used in Lite sub-graph engine. +/// \param[in] zero_copy whether use zero copy. +/// \param[in] passes_filter_num The number of passes used in Lite sub-graph +/// engine. +/// \param[in] passes_filter The name of passes used in Lite sub-graph engine. +/// \param[in] ops_filter_num The number of operators not supported by Lite. +/// \param[in] ops_filter The name of operators not supported by Lite. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine( + __pd_keep PD_Config* pd_config, PD_PrecisionType precision, + PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter, + size_t ops_filter_num, const char** ops_filter); +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \param[in] pd_onfig config +/// \return Whether the Lite sub-graph engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. +/// +/// \param[in] pd_onfig config +/// \param[in] x whether to debug IR graph analysis phase. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief Turn on MKLDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. +/// Please see MKL-DNN Data Caching Design Document: +/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md +/// +/// \param[in] pd_onfig config +/// \param[in] capacity The cache capacity. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity( + __pd_keep PD_Config* pd_config, int32_t capacity); +/// +/// \brief A boolean state telling whether to use the MKLDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the number of cpu math library threads. +/// +/// \param[in] pd_onfig config +/// \param cpu_math_library_num_threads The number of cpu math library +/// threads. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads); +/// +/// \brief An int state telling how many threads are used in the CPU math +/// library. +/// +/// \param[in] pd_onfig config +/// \return The number of threads used in the CPU math library. 
+/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the operator type list to use MKLDNN acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Turn on MKLDNN quantization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the MKLDNN quantization is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on MKLDNN bfloat16. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN Bfloat16. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( + __pd_keep PD_Config* pd_config); +/// \brief Specify the operator type list to use Bfloat16 acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Enable the GPU multi-computing stream feature. +/// NOTE: The current behavior of this interface is to bind the computation +/// stream to the thread, and this behavior may be changed in the future. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the thread local CUDA stream is +/// enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the thread local CUDA stream is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the memory buffer of program and parameter. +/// Used when model and params are loaded directly from memory. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_buffer The memory buffer of program. +/// \param[in] prog_buffer_size The size of the model data. +/// \param[in] params_buffer The memory buffer of the combined parameters file. +/// \param[in] params_buffer_size The size of the combined parameters data. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer( + __pd_keep PD_Config* pd_config, const char* prog_buffer, + size_t prog_buffer_size, const char* params_buffer, + size_t params_buffer_size); +/// +/// \brief A boolean state telling whether the model is set from the CPU +/// memory. +/// +/// \param[in] pd_onfig config +/// \return Whether model and params are loaded directly from memory. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on memory optimize +/// NOTE still in development. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the memory optimization is +/// activated. +/// +/// \param[in] pd_onfig config +/// \return Whether the memory optimization is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on profiling report. +/// If not turned on, no profiling report will be generated. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the profiler is activated. +/// +/// \param[in] pd_onfig config +/// \return bool Whether the profiler is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Mute all logs in Paddle inference. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether logs in Paddle inference are muted. +/// +/// \param[in] pd_onfig config +/// \return Whether logs in Paddle inference are muted. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the Config to be invalid. +/// This is to ensure that an Config can only be used in one +/// Predictor. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the Config is valid. +/// +/// \param[in] pd_onfig config +/// \return Whether the Config is valid. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid( + __pd_keep PD_Config* pd_config); +/// +/// \brief Partially release the memory +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease( + __pd_keep PD_Config* pd_config); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h similarity index 58% rename from paddle/fluid/operators/distributed/distributed_pb.h rename to paddle/fluid/inference/capi_exp/pd_inference_api.h index f1c662be9af67b418e17987e4eb1ff0a2809c3e3..5f21dca1a7bf6a3a74f19cf814b7138d39db8054 100644 --- a/paddle/fluid/operators/distributed/distributed_pb.h +++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -14,17 +14,9 @@ #pragma once -#ifdef PADDLE_WITH_DISTRIBUTE - -#ifdef PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE +#include "pd_common.h" // NOLINT +#include "pd_config.h" // NOLINT +#include "pd_predictor.h" // NOLINT +#include "pd_tensor.h" // NOLINT +#include "pd_types.h" // NOLINT +#include "pd_utils.h" // NOLINT diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5287a5152957f5cda0db9dee82a7689267cd3d2 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_PREDICTOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_predictor, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle predictor shouldn't be nullptr")); \ + auto& predictor = pd_predictor->predictor + +extern "C" { +__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) { + PADDLE_ENFORCE_NOT_NULL( + pd_config, paddle::platform::errors::InvalidArgument( + "The pointer of paddle predictor shouldn't be nullptr")); + PD_Predictor* pd_predictor = new PD_Predictor(); + paddle_infer::Config* config = + reinterpret_cast(pd_config); + pd_predictor->predictor = paddle_infer::CreatePredictor(*config); + delete config; + return pd_predictor; +} + +__pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Predictor* new_predictor = new PD_Predictor(); + new_predictor->predictor = predictor->Clone(); + return new_predictor; +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetInputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetOutputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetInputNames().size(); +} + +size_t 
PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetOutputNames().size(); +} +__pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetInputHandle(name); + return pd_tensor; +} + +__pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetOutputHandle(name); + return pd_tensor; +} + +PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->Run(); +} + +void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..d4542d0b6d394d2ebc67e6f63b0b52cefb5939b3 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] pd_config config +/// \return new predictor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor.
+/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. +/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c661dea6f2bb2dcb168e5d08e80794195ef2710 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_tensor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_TENSOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_tensor, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle tensor shouldn't be nullptr")); \ + auto& tensor = pd_tensor->tensor + +extern "C" { + +void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; } +void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size, + int32_t* shape) { + CHECK_AND_CONVERT_PD_TENSOR; + std::vector<int> shapes(shape_size); + for (size_t index = 0; index < shape_size; ++index) { + shapes[index] = shape[index]; + } + tensor->Reshape(shapes); +} + +#define REPEAT_ALL_DATA_TYPE(func) \ + func(float, Float) func(int64_t, Int64) func(int32_t, Int32) \ + func(uint8_t, Uint8) func(int8_t, Int8) + +#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type) \ + type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType place) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + return tensor->mutable_data<type>(paddle_infer::CvtToCxxPlaceType(place)); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL) +#undef PD_TENSOR_MUTABLE_DATA_IMPL + +#define PD_TENSOR_DATA_IMPL(type, Type) \ + type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType* place, int32_t* size) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + PADDLE_ENFORCE_NOT_NULL(place, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of place shouldn't be nullptr")); \ + PADDLE_ENFORCE_NOT_NULL(size, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of size shouldn't be nullptr")); \ + paddle_infer::PlaceType cxx_place_type; \ + int cxx_size; \ + type* data = tensor->data<type>(&cxx_place_type, &cxx_size); \ + *place = paddle_infer::CvtFromCxxPlaceType(cxx_place_type); \ + *size = static_cast<int32_t>(cxx_size); \ + return data; \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL) +#undef PD_TENSOR_DATA_IMPL + +#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type) \ + void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor, \ + const type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyFromCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL) +#undef PD_TENSOR_COPY_FROM_CPU_IMPL + +#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type) \ + void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyToCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL) +#undef PD_TENSOR_COPY_TO_CPU_IMPL + +#undef REPEAT_ALL_DATA_TYPE + +__pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape()); +} +void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor, + __pd_keep
PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 0000000000000000000000000000000000000000..29ea4b5d62e43ccf44bb425f8c43c122c1b0f220 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. 
+/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. 
+/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. 
+/// \param[in] lod lod information. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 0000000000000000000000000000000000000000..a5da2913a9b20719346c426770bd9b40d779ffd0 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e762619f5567c3fce05272815f9a8a0f17d267c --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
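The plain C structs defined in pd_types.h above are how the new C API hands variable-length results back to the caller: anything returned with the __pd_give annotation is owned by the caller and must be released with the matching *Destroy helper implemented in pd_utils.cc below. A minimal caller-side sketch follows; it is illustrative only and assumes a PD_Predictor* named predictor has already been created elsewhere.

#include <stdio.h>
#include "pd_inference_api.h"

/* Illustrative only: list the input names of an existing predictor. */
void print_input_names(PD_Predictor* predictor) {
  /* __pd_give: ownership of the returned array passes to the caller. */
  PD_OneDimArrayCstr* names = PD_PredictorGetInputNames(predictor);
  for (size_t i = 0; i < names->size; ++i) {
    printf("input #%zu: %s\n", i, names->data[i]);
  }
  /* The matching destroy call frees each C string and the array itself. */
  PD_OneDimArrayCstrDestroy(names);
}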
+ +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define DESTROY_ONE_DIM_ARRAY(type) \ + void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) { \ + if (array != NULL) { \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type( \ + const std::vector& vec) { \ + PD_OneDimArray##Type* array = new PD_OneDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? NULL : new type[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = vec[index]; \ + } \ + return array; \ + } +#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector CvtOneDimArrayToVec##Type( \ + __pd_keep const PD_OneDimArray##Type* array) { \ + std::vector vec; \ + if (array != NULL) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = array->data[index]; \ + } \ + } \ + return vec; \ + } + +#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_ONE_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int) +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_ONE_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_ONE_DIM_ARRAY +#undef DESTROY_ONE_DIM_ARRAY + +void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { + if (array != NULL) { + if (array->size != 0) { + for (size_t index = 0; index < array->size; ++index) { + delete[] array->data[index]; + } + } + delete[] array->data; + delete array; + } +} +namespace paddle_infer { + +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec) { + PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr; + array->size = vec.size(); + array->data = vec.empty() ? NULL : new char*[vec.size()]; + for (size_t index = 0u; index < vec.size(); ++index) { + array->data[index] = new char[vec[index].size() + 1]; + memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1); + } + return array; +} + +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array) { + std::vector vec; + for (size_t index = 0; index < array->size; ++index) { + vec.emplace_back(array->data[index]); + } + return vec; +} + +} // namespace paddle_infer + +#define DESTROY_TWO_DIM_ARRAY(type) \ + void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) { \ + if (array != NULL) { \ + if (array->size != 0) { \ + for (size_t index = 0; index < array->size; ++index) { \ + PD_OneDimArray##type##Destroy(array->data[index]); \ + } \ + } \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type( \ + const std::vector>& vec) { \ + PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new PD_OneDimArray##Type*[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = CvtVecToOneDimArray##Type(vec[index]); \ + } \ + return array; \ + } +#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector<std::vector<vec_type>> CvtTwoDimArrayToVec##Type( \ + __pd_keep const PD_TwoDimArray##Type* array) { \ + std::vector<std::vector<vec_type>> vec; \ + if (array != NULL && array->size != 0) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]); \ + } \ + } \ + return vec; \ + } +#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_TWO_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_TWO_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_TWO_DIM_ARRAY +#undef DESTROY_TWO_DIM_ARRAY + +namespace paddle_infer { + +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) { + switch (place_type) { + case PD_PLACE_UNK: + return PlaceType::kUNK; + case PD_PLACE_CPU: + return PlaceType::kCPU; + case PD_PLACE_GPU: + return PlaceType::kGPU; + case PD_PLACE_XPU: + return PlaceType::kXPU; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported paddle place type %d.", place_type)); + return PlaceType::kUNK; + } +} + +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) { + switch (place_type) { + case PlaceType::kCPU: + return PD_PLACE_CPU; + case PlaceType::kGPU: + return PD_PLACE_GPU; + case PlaceType::kXPU: + return PD_PLACE_XPU; + default: + return PD_PLACE_UNK; + } +} + +DataType CvtToCxxDatatype(PD_DataType data_type) { + switch (data_type) { + case PD_DATA_FLOAT32: + return DataType::FLOAT32; + case PD_DATA_INT64: + return DataType::INT64; + case PD_DATA_INT32: + return DataType::INT32; + case PD_DATA_UINT8: + return DataType::UINT8; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupported paddle data type %d.", data_type)); + return DataType::FLOAT32; + } +} + +PD_DataType CvtFromCxxDatatype(DataType data_type) { + switch (data_type) { + case DataType::FLOAT32: + return PD_DATA_FLOAT32; + case DataType::INT64: + return PD_DATA_INT64; + case DataType::INT32: + return PD_DATA_INT32; + case DataType::UINT8: + return PD_DATA_UINT8; + default: + return PD_DATA_UNK; + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..68e519d4bb5e959dd618b5aa31d090c7a74dc2a7 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
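Putting the predictor and tensor functions above together, a typical single-inference flow through this C API looks roughly like the sketch below. The shape, the buffer sizes, and the use of the first input/output name are illustrative assumptions, not part of the API.

#include <stdint.h>
#include "pd_inference_api.h"

/* Rough sketch: feed one float input of shape [1, 3, 224, 224], run, read output. */
void run_once(PD_Predictor* predictor, const float* input, float* output) {
  int32_t shape[4] = {1, 3, 224, 224};

  PD_OneDimArrayCstr* in_names = PD_PredictorGetInputNames(predictor);
  PD_Tensor* in = PD_PredictorGetInputHandle(predictor, in_names->data[0]);
  PD_TensorReshape(in, 4, shape);        /* reshape before CopyFromCpu */
  PD_TensorCopyFromCpuFloat(in, input);

  PD_PredictorRun(predictor);

  PD_OneDimArrayCstr* out_names = PD_PredictorGetOutputNames(predictor);
  PD_Tensor* out = PD_PredictorGetOutputHandle(predictor, out_names->data[0]);
  PD_TensorCopyToCpuFloat(out, output);  /* output buffer must be large enough */

  PD_TensorDestroy(out);
  PD_TensorDestroy(in);
  PD_OneDimArrayCstrDestroy(out_names);
  PD_OneDimArrayCstrDestroy(in_names);
}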
+ +/// +/// \file pd_utils.h +/// +/// \brief Some utility functions to destroy paddle structs. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..8a61b9a884c3bfe7c3b6cc3734536a36872f74d0 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr<paddle_infer::Tensor> tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr<paddle_infer::Predictor> predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 0000000000000000000000000000000000000000..fbae512ecd85578cf236100ba1e0b00a6c18775f --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility functions used to convert objects between C structs +/// and C++ classes. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector<int>' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector<int>& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector<int>' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<int> CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector<size_t>' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector<size_t>& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector<size_t>' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<size_t> CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector<std::string>' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector<std::string>& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector<std::string>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<std::string> CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector<std::vector<size_t>>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector<std::vector<size_t>>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector<std::vector<size_t>>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector<std::vector<size_t>> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] data_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] data_type source object. +/// \return target object.
+/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index 1a13ba510384c010e476bf0ba0ad5b0ba84d3240..e29162cf5b23bacafcf2e5ef600a96ed4518c360 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98bf5972f23bd6148712eccc198aa6..908e1ab990bb73b124158f66cd0413a4b6a20907 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d58cd87853a58fc12f1a82c3610d..a64ef1eda828bf2a5fc96c1cc8435c0a4b6912c6 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f9586ca1701f74f140e6a78b8758a76c1739a54a..3820ac5d7cc24693c388554acea0aad6ab49b83a 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -5,6 +5,13 @@ nv_library(tensorrt_converter pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc + gather_op.cc + anchor_generator_op.cc + yolo_box_op.cc + roi_align_op.cc + affine_channel_op.cc + multiclass_nms_op.cc + nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..813342c08483b7e9124929d3f00d8155d337e67e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Affine Channel Op + */ +class AffineChannelOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid affine_channel op to tensorrt scale nd layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string scale_name = op_desc.Input("Scale").front(); + std::string bias_name = op_desc.Input("Bias").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input_tensor = engine_->GetITensor(input_name); + auto idim = input_tensor->getDimensions(); + + auto* scale_v = scope.FindVar(scale_name); + auto* scale_t = scale_v->GetMutable<framework::LoDTensor>(); + float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t, false); + + auto* bias_v = scope.FindVar(bias_name); + auto* bias_t = bias_v->GetMutable<framework::LoDTensor>(); + float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + + PADDLE_ENFORCE_EQ( + data_layout, framework::DataLayout::kNCHW, + platform::errors::InvalidArgument( + "TensorRT affine channel converter can only convert NCHW format. " + "Other format should be run in fluid mode. Report a bug on github " + "issue if you see this line.")); + + // the tensorrt ScaleNd layer only supports spatial dims >= 2, + // so nhwc is not available (spatial dims == 0) + const int channel_axis = engine_->with_dynamic_shape(); + + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, + static_cast<void*>(scale_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, + static_cast<void*>(bias_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *input_tensor, + nvinfer1::ScaleMode::kCHANNEL, + bias_weights.get(), scale_weights.get(), + power_weights.get(), channel_axis); + + RreplenishLayerAndOutput(layer, "affine_channel", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..56aab9785c90f37e170e204b20f8f00a09941018 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* Anchor Generator Op */ +class AnchorGeneratorOpConverter : public OpConverter { + public: + void operator()(const paddle::framework::proto::OpDesc& op, + const paddle::framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a fluid anchor generator op to tensorrt plugin"; + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("Input").front(); + std::string anchor_name = op_desc.Output("Anchors").front(); + std::string variance_name = op_desc.Output("Variances").front(); + + auto* input = engine_->GetITensor(input_name); + const auto input_dims = input->getDimensions(); // C, H, W + std::vector output_names{anchor_name, variance_name}; + + const auto anchor_sizes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchor_sizes")); + const auto aspect_ratios = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("aspect_ratios")); + const auto stride = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("stride")); + const auto variances = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("variances")); + const auto offset = BOOST_GET_CONST(float, op_desc.GetAttr("offset")); + const int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + bool is_dynamic = engine_->with_dynamic_shape(); + const auto height = input_dims.d[1]; + const auto width = input_dims.d[2]; + const int box_num = width * height * num_anchors; + const nvinfer1::DataType data_type = nvinfer1::DataType::kFLOAT; + + nvinfer1::IPluginV2* anchor_generator_plugin = nullptr; + if (is_dynamic) { + anchor_generator_plugin = new plugin::AnchorGeneratorPluginDynamic( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + num_anchors); + } else { + anchor_generator_plugin = new plugin::AnchorGeneratorPlugin( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); + } + + std::vector anchor_generator_inputs{input}; + auto* anchor_generator_layer = engine_->network()->addPluginV2( + anchor_generator_inputs.data(), anchor_generator_inputs.size(), + *anchor_generator_plugin); + + RreplenishLayerAndOutput(anchor_generator_layer, "anchor_generator", + output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(anchor_generator, AnchorGeneratorOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 26cd7b22d2baaacea65a800b456e69d28955699f..7ea41839cb939ff3a1a7b3c6921b6e014bcdc1b6 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to 
tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); @@ -158,17 +126,49 @@ class BatchNormOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::IScaleLayer* layer = - TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), - nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + nvinfer1::ILayer* layer = nullptr; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + auto x_dim = X->getDimensions(); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < 3 + dynamic_shape_offset; i++) { + if (i < x_dim.nbDims) { + expand_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = x_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = x_dim.d[i] < 0 ? 
0 : x_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc70b6ecad44f4295084546386b96..61199724bcfe30dfcfc0e044a54e49b62d3a0936 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -179,19 +160,11 @@ class Deconv2dOpConverter : public OpConverter { nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input, + TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_output, ksize, weight.get(), bias.get()); return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. 
- bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index dfadb28a6520f983986263b38be69fa48335d485..47f5cc97d39cdf785bdbcbc468714f3fe0357209 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,6 +25,10 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { + // conservative judgment + if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { + return false; + } if (dims_x.d[i] != dims_y.d[i]) { return false; } @@ -43,25 +47,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -81,6 +66,25 @@ class ElementwiseWeightOpConverter : public OpConverter { 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + auto input_dim = X->getDimensions(); + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < expand_shape.nbDims; i++) { + if (i < input_dim.nbDims) { + expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *X, scale_mode, shift_weights.get(), @@ -92,7 +96,17 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), power_weights.get()); layer = scale_layer; } - + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = input_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = input_dim.d[i] < 0 ? 
0 : input_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, test_mode); @@ -193,25 +207,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; @@ -251,7 +246,7 @@ class ElementwiseTensorOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::ElementwisePluginDynamic* plugin = new plugin::ElementwisePluginDynamic(op_type_, axis); - layer = engine_->AddPluginV2(itensors.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 7f8843a3f67d05465788132ac85257dcdf3c322c..66a682db07b91195046d3d11031b8739b72b81c4 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -31,16 +31,20 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(6000) - VLOG(4) << "convert fluid swish op to tensorrt layer"; + VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - auto id_names = op_desc.Input("Ids"); - auto emb_names = op_desc.Input("Embs"); + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = op_desc.Input("SentEmbedding").front(); + std::vector id_names = {word_id_name, pos_id_name, + sent_id_name}; + std::vector emb_names = {word_emb_name, pos_emb_name, + sent_emb_name}; - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - "should be same ")); int input_num = id_names.size(); // Declare inputs @@ -89,97 +93,98 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); nvinfer1::ILayer* layer = 
nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). " - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 
1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). " + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 41fbbb557d6470228b9315f8d0e0ed1e5ad905ac..d2dcd4d11bfc8fa0b1021a2481ff930527567a9f 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -37,7 +37,7 @@ class FcOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); - + auto output_name = op_desc.Output("Out").front(); auto input_names = op_desc.InputNames(); bool with_bias = input_names.size() >= 3; std::string w_name = "Y"; @@ -54,7 +54,7 @@ class FcOpConverter : public OpConverter { Y_v, platform::errors::NotFound( "Can not find %s presistale var of fc in scope.", w_name)); auto* Y_t = Y_v->GetMutable(); - const int x_num_col_dims = + int x_num_col_dims = op_desc.HasAttr("x_num_col_dims") ? BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims")) : (op_desc.HasAttr("in_num_col_dims") @@ -106,17 +106,55 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); - - auto output_name = op_desc.Output("Out").front(); - if (activation_type == "relu") { - nvinfer1::IActivationLayer* relu_layer = - TRT_ENGINE_ADD_LAYER(engine_, Activation, *(fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "fc", {output_name}, test_mode); + if (enable_int8) { + // add conv layer + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + nvinfer1::DimsHW nv_ksize(1, 1); + auto* fc_layer_int8 = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_int8->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_int8, "shuffle_after_fc", + {output_name}, test_mode); + } } else { - RreplenishLayerAndOutput(fc_layer, "fc", {output_name}, test_mode); + // add fc layer + auto* fc_layer_before = + TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output, + weight.get(), bias.get()); + fc_layer_before->setName( + ("fc_layer_before(Output: " + output_name + ")").c_str()); 
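// --- Illustrative sketch (not part of this patch) ------------------------------
// Why a Shuffle follows the FullyConnected layer here: TensorRT's
// IFullyConnectedLayer flattens the trailing three dimensions of its (already
// padded) input and emits an output shaped [*, n_output, 1, 1]. The reshape added
// below therefore restores a rank of x_num_col_dims + 1, relying on the TensorRT
// convention that a 0 in setReshapeDimensions() means "copy the corresponding
// input dimension". A minimal stand-alone illustration of that convention in
// plain C++ (all names and shapes below are hypothetical, for explanation only):
#include <cstdio>
#include <vector>

static std::vector<int> ApplyTrtReshape(const std::vector<int>& in,
                                        const std::vector<int>& spec) {
  std::vector<int> out(spec.size());
  for (size_t i = 0; i < spec.size(); ++i) {
    // 0 copies the input dimension at the same index, as IShuffleLayer does.
    out[i] = (spec[i] == 0 && i < in.size()) ? in[i] : spec[i];
  }
  return out;
}

int main() {
  std::vector<int> fc_out = {2, 128, 768, 1, 1};  // FC output, x_num_col_dims = 2
  std::vector<int> spec = {0, 0, 0};              // reshape_after_fc_dim above
  for (int d : ApplyTrtReshape(fc_out, spec)) std::printf("%d ", d);  // 2 128 768
  return 0;
}
// --------------------------------------------------------------------------------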
+ // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *fc_layer_before->getOutput(0)); + fc_layer_float->setReshapeDimensions(reshape_after_fc_dim); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_float->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_float, "shuffle_after_fc", + {output_name}, test_mode); + } } }; @@ -143,71 +181,43 @@ class FcOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_num)}; - if (engine_->with_dynamic_shape()) { - regist_fc(X, n_output, weight, bias); - return; + auto x_dim = X->getDimensions(); + // Running the TRT Static Shape mode: x_num_col_dims-1 + if (!engine_->with_dynamic_shape()) { + x_num_col_dims--; } - // in order to handle situations in NLP models(input dims < 3, - // x_num_col_dims != 1, etc.), reshape input to perform FC correctly. - auto* reshape_itensor = X; - int input_dims = X->getDimensions().nbDims; - auto input_d = X->getDimensions().d; - int reshape_dim3[3] = {0}; - int reshape_dim4[4] = {0}; - PADDLE_ENFORCE_EQ( - x_num_col_dims == 1 || x_num_col_dims == 2, true, + PADDLE_ENFORCE_GT( + x_dim.nbDims, x_num_col_dims, platform::errors::InvalidArgument( - "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter " - "expects x_num_col_dims is either 1 or 2, but got %d", - x_num_col_dims)); - PADDLE_ENFORCE_LE(x_num_col_dims, input_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_num_col_dims <= input dims")); - if (x_num_col_dims == 1) { - if (input_dims == 4) { - PADDLE_ENFORCE_EQ( - input_d[3], 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to 1 and input " - "dims equals to 4, the last dim of input must be 1, but got %d", - input_d[3])); - } - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; - } - } - nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], - reshape_dim3[2]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - } else { - PADDLE_ENFORCE_NE(input_dims, 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to " - "2, input_dims should not be 1")); - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, x_num_col_dims)); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_num_col_dims + 3; + // padding shape "* x q x 1 x 1" + for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 1; + } + for (int i = 0; i < x_dim.nbDims; i++) { + if (i < x_num_col_dims) { + reshape_before_fc_dim.d[i] = 0; + } else { + if (x_dim.d[i] < 0) { + reshape_before_fc_dim.d[x_num_col_dims] = -1; + break; } + reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; } - nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], - reshape_dim4[2], reshape_dim4[3]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } + } + auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); } regist_fc(reshape_itensor, n_output, weight, bias); } diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..346a8bffa00e383781a2e0a26afaa97437598b8d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Gather Op + */ +class GatherOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid gather op to tensorrt gather layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string index_name = op_desc.Input("Index").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto index_tensor = engine_->GetITensor(index_name); + + const int axis = 0; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor, + *index_tensor, axis); + + auto odim = layer->getOutput(0)->getDimensions(); + + auto reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + + nvinfer1::Dims target_shape{}; + target_shape.nbDims = odim.nbDims - 1; + for (int i = 0; i < axis; ++i) { + target_shape.d[i] = odim.d[i]; + } + target_shape.d[axis] = 0; + for (int i = axis + 1; i < target_shape.nbDims; ++i) { + target_shape.d[i] = odim.d[i + 1]; + } + + reshape_layer->setReshapeDimensions(target_shape); + + RreplenishLayerAndOutput(reshape_layer, "gather", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(gather, GatherOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 4c9996ca02cad48950c2c68763e2c0270cd1f9e4..0436499cd40756150d5b33c6d685d74ffbe5b87d 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { @@ -64,7 +56,7 @@ class GeluOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec4809489bc308485934150e8d1491c83..7ef79e547d09ab678fd1ce43c301bd893ea4e822 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, 
nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec5784d6f641574ce20f91496f2f35..0b97b5d87a3d506e9e14ea5780a9e7b4ac471c83 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c15d24d317f2fc3290f650e753d7a..d6277b5208d5a1588cddc366e66995046f77f3fd 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. 
Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b0d67a5bf90ca9fcad742367a4c1a3c2c3eb0ee2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class MultiClassNMSOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid multiclassNMS op to tensorrt plugin"; + + // for now, only work for static shape and regular tensor + framework::OpDesc op_desc(op, nullptr); + + std::string bboxes = op_desc.Input("BBoxes").front(); + std::string scores = op_desc.Input("Scores").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto* bboxes_tensor = engine_->GetITensor(bboxes); + auto* scores_tensor = engine_->GetITensor(scores); + + int background_label = + BOOST_GET_CONST(int, op_desc.GetAttr("background_label")); + float score_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("score_threshold")); + int nms_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("nms_top_k")); + float nms_threshold = + BOOST_GET_CONST(float, op_desc.GetAttr("nms_threshold")); + int keep_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("keep_top_k")); + bool normalized = BOOST_GET_CONST(bool, op_desc.GetAttr("normalized")); + int num_classes = scores_tensor->getDimensions().d[0]; + + auto bboxes_dims = bboxes_tensor->getDimensions(); + nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); + auto* bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{1, 0}; + auto* scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + + std::vector batch_nms_inputs; + batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0)); + batch_nms_inputs.push_back(scores_transpose_layer->getOutput(0)); + + constexpr bool shareLocation = true; + constexpr bool clip_boxes = false; + + const std::vector fields{ + {"shareLocation", &shareLocation, nvinfer1::PluginFieldType::kINT32, 1}, + {"backgroundLabelId", &background_label, + nvinfer1::PluginFieldType::kINT32, 1}, + {"numClasses", &num_classes, 
nvinfer1::PluginFieldType::kINT32, 1}, + {"topK", &nms_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1}, + {"scoreThreshold", &score_threshold, + nvinfer1::PluginFieldType::kFLOAT32, 1}, + {"iouThreshold", &nms_threshold, nvinfer1::PluginFieldType::kFLOAT32, + 1}, + {"isNormalized", &normalized, nvinfer1::PluginFieldType::kINT32, 1}, + {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + nvinfer1::PluginFieldCollection* plugin_collections = + static_cast( + malloc(sizeof(*plugin_collections) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_collections->nbFields = static_cast(fields.size()); + plugin_collections->fields = fields.data(); + + auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + auto batch_nms_plugin = + creator->createPlugin("BatchNMSPlugin", plugin_collections); + free(plugin_collections); + + auto batch_nms_layer = engine_->network()->addPluginV2( + batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin); + auto nmsed_boxes = batch_nms_layer->getOutput(1); + auto nmsed_scores = batch_nms_layer->getOutput(2); + auto nmsed_classes = batch_nms_layer->getOutput(3); + + auto nmsed_scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + auto nmsed_classes_reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + std::vector concat_inputs; + concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); + concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0)); + concat_inputs.push_back(nmsed_boxes); + + auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); + nms_concat_layer->setAxis(1); + + RreplenishLayerAndOutput(nms_concat_layer, "multiclass_nms", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(multiclass_nms, MultiClassNMSOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index ee04fd372c4588b492948a5acd23493d2ba423c5..f2f45c694ab44fb03cfd6b018ef0a0a1ae6f0a31 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter { "network structure"; framework::OpDesc op_desc(op, nullptr); // Declare inputs - // Shouble be a 5 dims tensor. 
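// --- Note (illustrative, not part of this patch) --------------------------------
// The int8 handling added in this converter follows the symmetric-quantization
// convention used by the Paddle-TRT integration: SetTensorDynamicRange(t, r)
// declares |t| <= r. Paddle's quantization pass stores "Input_scale" as a per-step
// scale s (float_value ~= int8_value * s), so the range bound is s * 127, which is
// why the attribute is multiplied by 127 below. "out_threshold" is already a range
// bound, so it is passed to SetTensorDynamicRange directly and divided by 127.0 to
// recover the per-step scale handed to the oss plugin as the "dq_probs" field.
// Rough arithmetic, assuming (hypothetically) Input_scale = 0.02:
//   dynamic range = 0.02 * 127 = 2.54, i.e. the largest activation magnitude
//   representable for that tensor at int8 precision.
// ---------------------------------------------------------------------------------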
auto* input = engine_->GetITensor(op_desc.Input("Input").front()); // fc weights and fc bias @@ -41,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter { auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(weight_name, weight_t, false); + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + float in_scale = 0.; + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument( + "must have input scale in multihead layers in int8 mode")); + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; + auto weight_scale = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); + weight_data = + engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale); + engine_->SetTensorDynamicRange(input, in_scale); + } else { + weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false); + } + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); @@ -69,6 +85,7 @@ class MultiheadMatMulOpConverter : public OpConverter { int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number")); nvinfer1::ILayer* layer = nullptr; + auto output_name = op_desc.Output("Out")[0]; if (engine_->with_dynamic_shape()) { if (engine_->use_oss()) { @@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight, bias); + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + dp_probs = out_scale / 127.0; + } auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); @@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter { int type = static_cast((engine_->WithFp16() == 1) ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); + } bool has_mask = true; int var_seqlen = 1; const std::vector fields{ @@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, - }; + { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }}; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( malloc(sizeof(*plugin_collection) + @@ -171,6 +210,12 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.data(), plugin_inputs.size(), *plugin); layer = plugin_layer; } else { + PADDLE_ENFORCE_EQ( + input->getDimensions().nbDims, 3, + platform::errors::InvalidArgument( + "The Input dim of the MultiheadMatMul should be 3, " + "but it's (%d) now.", + input->getDimensions().nbDims)); // transpose weight_data from m * n to n * m auto* input_bias_qk = engine_->GetITensor(op_desc.Input("BiasQK").front()); @@ -184,22 +229,44 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight.get(), bias.get()); - auto* fc_out = fc_layer->getOutput(0); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = 5; + reshape_before_fc_dim.d[0] = 0; + reshape_before_fc_dim.d[1] = 0; + reshape_before_fc_dim.d[2] = 0; + reshape_before_fc_dim.d[3] = 1; + reshape_before_fc_dim.d[4] = 1; + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_multihead_mamul(Output: " + output_name + ")") + .c_str()); + + // add layer fc + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, + weight.get(), bias.get()); + fc_layer->setName( + ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); + + // no need to add shuffle after fc, just change it in + // QkvToContextPluginDynamic + // add qkv to context int head_size = hidden_out / head_number; float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); std::vector plugin_inputs; - plugin_inputs.push_back(fc_out); + plugin_inputs.push_back(fc_layer->getOutput(0)); plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( @@ -208,7 +275,6 @@ class MultiheadMatMulOpConverter : public OpConverter { "You can use the config.SetTRTDynamicShapeInfo(...) 
interface to set " "the shape information to run the dynamic shape mode.")); } - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name}, test_mode); #else diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3940cc5dce1b0003eb947e63706b9ebd0463ef6a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class NearestInterpolateOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid nearest_interp op"; + + framework::OpDesc op_desc(op, nullptr); + + std::string input_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input = engine_->GetITensor(input_name); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + auto interp_method = + BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method")); + bool align_corners = + BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners")); + + auto input_names = op_desc.Input("X"); + auto scale = BOOST_GET_CONST(float, op_desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w")); + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); + layer->setAlignCorners(align_corners); + + auto in_dim = input->getDimensions(); + + float scale_h = 1.f; + float scale_w = 1.f; + + std::vector scales; + + if (scale > 0.f && (out_h <= 0 && out_w <= 0)) { + scale_h = scale; + scale_w = scale; + } else { + // axis are different in static/dynamic mode + bool with_dynamic = engine_->with_dynamic_shape(); + + int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; + int w_axis = + (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic; + + scale_h = + static_cast(out_h) / static_cast(in_dim.d[h_axis]); + scale_w = + static_cast(out_w) / static_cast(in_dim.d[w_axis]); + } + + if (engine_->with_dynamic_shape()) { + scales.push_back(1.f); + } + + if (data_layout == framework::DataLayout::kNCHW) { + scales.push_back(1.f); + scales.push_back(scale_h); + scales.push_back(scale_w); + } else if (data_layout == framework::DataLayout::kNHWC) { + // NHWC + scales.push_back(scale_h); + scales.push_back(scale_w); + scales.push_back(1.f); 
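          // Note (illustrative, not part of this patch): IResizeLayer::setScales
          // expects one scale factor per input dimension, in dimension order. With
          // an explicit batch (dynamic shape) a leading 1.0 is pushed first; then
          // NCHW gets {1, scale_h, scale_w} while NHWC gets {scale_h, scale_w, 1},
          // matching the branches above. For example, with in_h = 32 and
          // out_h = 64, scale_h = 64 / 32 = 2.0.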
+ } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Data layout must be NCHW or NHWC.")); + } + layer->setScales(scales.data(), scales.size()); + + RreplenishLayerAndOutput(layer, "nearest_interp", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(nearest_interp, NearestInterpolateOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 8de16df0a2f610b30da389bc73e122074d66471e..f72ae2c3ec2d7e013247f294a6f3e6dd4572ae35 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -109,6 +109,12 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (op_desc.Type() == "transpose2") { it = Registry::Global().Lookup("transpose"); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd28af1ad96670f2d15406bbb3987e..d6711bbbd2cb52fc4508f100ab1e5f1781cc4177 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. 
But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index aa4e54b58457227330e22c2a3e29868fe6215de6..90d6392fd6404ef8b46e3ae6783c24691995fa00 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); @@ -147,7 +134,7 @@ class Pool2dOpConverter : public OpConverter { plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, strides, paddings, global_pooling); - layer = engine_->AddPluginV2(&input1, 1, plugin); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 5e881ecbbc4e2cc8e81b9334dc827513bfad02eb..a8a36e1238168ad368a02bf2ebed915939c3d5c1 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // @@ -65,7 +53,7 @@ class PReluOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( alpha_data, alpha_tensor_temp->numel(), mode); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..654fe7e013379669c0d67e8690215b6eaca18443 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Roi Align Op + */ +class RoiAlignOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid roi align op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string rois_name = op_desc.Input("ROIs").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto pooled_height = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_height")); + const auto pooled_width = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_width")); + const auto spatial_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); + const auto sampling_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto rois_tensor = engine_->GetITensor(rois_name); + + const nvinfer1::DataType data_type_ = engine_->WithFp16() + ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + + std::vector inputs{input_tensor, rois_tensor}; + nvinfer1::ILayer* layer = nullptr; + + auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + auto roi_align_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *roi_align_plugin); + layer = roi_align_layer; + + std::vector output_names{output_name}; + RreplenishLayerAndOutput(layer, "roi_align", output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66ce4a17ce7966bc0974d3de507008..0fdc262f7e740bc577bdb21a457d4288fcf7bf94 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 2e4a4e6120d2d835798f646b9c60b4fe2dbebf8e..e621ac0514109d40295cb402f1803b17da39bc87 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -49,55 +49,60 @@ class SkipLayerNormOpConverter : public OpConverter { auto* scale = get_persistable_data("Scale", &scale_dims); int bias_size = framework::product(bias_dims); int scale_size = framework::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? 
nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0bd2b8c9bf5eef2d2a9b45227cf09ae76ce3bb9a..2ab024dff327fda45faab01afbfbe38bb7244f93 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(input, out_scale); + } + std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -38,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -56,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } @@ -90,14 +83,14 @@ class SliceOpConverter : public OpConverter { // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), - plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), + plugin_inputs.size(), plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 79992065a22407ce09d7b64ab622eae993d36e9f..9cefb24751e18dfbb3b8283152cbcd58c81adc58 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -51,6 +51,7 @@ class SoftMaxOpConverter : public OpConverter { uint32_t axes = std::max(0, input_dims - 3); // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers // support Nd. 
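    // Note (illustrative, not part of this patch): ISoftMaxLayer::setAxes takes a
    // bit mask over the input tensor's dimensions (1 << axis). In static-shape
    // mode the batch dimension is implicit in TensorRT, so a positive Paddle axis
    // (counted with the batch included) is shifted down by one; in dynamic-shape
    // mode the batch is explicit and the axis maps directly. A negative axis is
    // normalized by adding the tensor rank first (the static branch additionally
    // subtracts any padded trailing 1-dims). E.g. a dynamic-shape NCHW input of
    // rank 4 with axis = -1 resolves to axes = 4 + (-1) = 3, so setAxes(1 << 3)
    // applies softmax over the last (W) dimension.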
+ // Tips: Dynammic shape alreay fixes. int padded_dims = 0; int explicit_batch = 0; if (engine_->with_dynamic_shape()) explicit_batch = 1; @@ -62,16 +63,16 @@ class SoftMaxOpConverter : public OpConverter { } } if (!engine_->with_dynamic_shape()) { - if (axis == -1) { - axes = input_dims - 1 - padded_dims; + if (axis < 0) { + axes = input_dims + axis - padded_dims; } else { - axes = axis; + axes = axis - 1; } } else { - if (axis == -1) { - axes = input_dims - 1 - padded_dims; + if (axis < 0) { + axes = input_dims + axis; } else { - axes = axis + 1; + axes = axis; } } layer->setAxes(1 << axes); diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 768c6efaa6bd40529a509698e186fa66c2e8e711..47a6dd783a70cf7b4a8c3d7beb988fbc0f6a8786 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); @@ -90,7 +80,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " @@ -101,7 +91,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths, with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, input_num, plugin); } std::string layer_name = "split (Output: "; diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index 1c971fa12e27e8706f5f68a3b1a5ffb34dbd4f40..6105e10799e5527e7c238b35bd8bb60f34e8d56f 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter { for (int i = 0; i < input_num; ++i) { inputs[i] = engine_->GetITensor(input[i]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(inputs[i], out_scale); + } } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); @@ -53,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddPluginV2(inputs, input_num, plugin); - assert(layer != 
nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 25944a2fead6cdb3862abf9b33fb1bb57fa48953..b2e394d14eba23f025cf3f729fa0c815231110f5 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -65,7 +65,7 @@ class SwishOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d12eaf736b754d623c2aa0e3c138a2ad80800b3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid yolo box op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string X = op_desc.Input("X").front(); + std::string img_size = op_desc.Input("ImgSize").front(); + + auto* X_tensor = engine_->GetITensor(X); + auto* img_size_tensor = engine_->GetITensor(img_size); + + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + + int downsample_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("downsample_ratio")); + float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); + bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); + float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + + int type_id = static_cast(engine_->WithFp16()); + auto input_dim = X_tensor->getDimensions(); + auto* yolo_box_plugin = new plugin::YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, + input_dim.d[1], input_dim.d[2]); + + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(X_tensor); + yolo_box_inputs.push_back(img_size_tensor); + + auto* yolo_box_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + + std::vector output_names; + output_names.push_back(op_desc.Output("Boxes").front()); + output_names.push_back(op_desc.Output("Scores").front()); + + RreplenishLayerAndOutput(yolo_box_layer, "yolo_box", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box, YoloBoxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 0bba4581ff90f931eba9399cb3b0b274342f4f16..99549fd6b5cbf96cf803e7f44b28c948daf0763d 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include -#include "cuda_runtime_api.h" +#include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" @@ -353,6 +353,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( return network()->addPluginExt(inputs, num_inputs, *plugin); } +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRTV2Ext *plugin) { + owned_plugin_v2ext_.emplace_back(plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); +} + void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 0e399578fa446793756a23e76013c3ed9a8bb9c4..2358e1ef976cdbc26eb907aff21b81f7e52d64d9 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -305,8 +305,14 @@ class TensorRTEngine { } int GetDeviceId() { return device_id_; } + nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); + + nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs, + int num_inputs, + plugin::PluginTensorRTV2Ext* plugin); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } @@ -372,9 +378,9 @@ class TensorRTEngine { bool with_dynamic_shape() { return with_dynamic_shape_; } #if IS_TRT_VERSION_GE(6000) - nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs, - int num_inputs, - plugin::DynamicPluginTensorRT* plugin) { + nvinfer1::IPluginV2Layer* AddDynamicPlugin( + nvinfer1::ITensor* const* inputs, int num_inputs, + plugin::DynamicPluginTensorRT* plugin) { owned_pluginv2_.emplace_back(plugin); return network()->addPluginV2(inputs, num_inputs, *plugin); } @@ -414,6 +420,7 @@ class TensorRTEngine { itensor_map_; std::vector> owned_plugin_; + std::vector> owned_plugin_v2ext_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e69197226bb7d7b26135f0b667f8ebdf30..6158fd130bad8d4df70fafb2a9f72c00e40217fd 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
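The engine-side changes above are the hinge for the converter edits in this patch: plugins implementing IPluginV2DynamicExt are now added through AddDynamicPlugin (the renamed AddPluginV2), while IPluginV2Ext-based plugins go through the new AddPluginV2Ext, whose lifetime is tracked in owned_plugin_v2ext_. A minimal converter-side sketch of the two paths, mirroring the split converter above (engine_, input, axis and output_lengths are the usual converter locals, not new API):

nvinfer1::ILayer* layer = nullptr;
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
if (engine_->with_dynamic_shape()) {
  // dynamic-shape path: an IPluginV2DynamicExt plugin via AddDynamicPlugin
  plugin::SplitPluginDynamic* plugin =
      new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16);
  layer = engine_->AddDynamicPlugin(&input, 1, plugin);
} else {
  // static-shape path: an IPluginV2Ext plugin via the new AddPluginV2Ext,
  // which stores the pointer in owned_plugin_v2ext_ so the engine owns it
  plugin::SplitPlugin* plugin =
      new plugin::SplitPlugin(axis, output_lengths, with_fp16);
  layer = engine_->AddPluginV2Ext(&input, 1, plugin);
}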
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 052d17878a5a9dfb3c2b2cae4644b3d4dda2942f..54fc9492b7193e90245cce23538e34a4857cfe1f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/data_layout.h" namespace paddle { namespace framework { @@ -41,6 +42,10 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); + int8_teller_set.insert("multihead_matmul"); + int8_teller_set.insert("skip_layernorm"); + int8_teller_set.insert("slice"); #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); @@ -60,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -95,6 +102,7 @@ struct SimpleOpTypeSetTeller : public Teller { "dropout", "prelu", "conv2d_transpose", + "depthwise_conv2d_transpose", "leaky_relu", "fc", "shuffle_channel", @@ -109,6 +117,12 @@ struct SimpleOpTypeSetTeller : public Teller { "transpose", "flatten2", "flatten", + "gather", + "yolo_box", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", }; }; @@ -124,13 +138,95 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || + op_type == "depthwise_conv2d_transpose") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + 
<< desc.Input("Input").size() << " input."; + return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose" || + op_type == "depthwise_conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -138,7 +234,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -159,7 +255,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } else { int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (axis <= 0) return false; + if (with_dynamic_shape) { + if (axis < 0) return false; + } else { + if (axis <= 0) return false; + } } } if (op_type == "transpose2" || op_type == "transpose") { @@ -172,7 +272,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -182,6 +293,360 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + + if (op_type == "gather") { + // current not support axis from input, use default 0 + if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; + } + + if (op_type == "yolo_box") { + if (with_dynamic_shape) return false; + bool has_attrs = + (desc.HasAttr("class_num") && desc.HasAttr("anchors") && + desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && + desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); + if (!has_attrs) return false; + } + + if (op_type == "affine_channel") { + if (!desc.HasAttr("data_layout")) return false; + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW) return false; + } + + if (op_type == "multiclass_nms") { + if (with_dynamic_shape) return false; + auto* block = 
desc.Block(); + for (auto& param_name : desc.Inputs()) { + for (auto& var_name : param_name.second) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + if (shape.size() != 3) { + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " + "but got dims " + << shape.size() << ", so jump it."; + return false; + } + } + } + bool has_attrs = + (desc.HasAttr("background_label") && + desc.HasAttr("score_threshold") && desc.HasAttr("nms_top_k") && + desc.HasAttr("keep_top_k") && desc.HasAttr("normalized")); + if (has_attrs == false) return false; + + auto nms_top_k = BOOST_GET_CONST(int, desc.GetAttr("nms_top_k")); + if (nms_top_k < 0) return false; + + auto keep_top_k = BOOST_GET_CONST(int, desc.GetAttr("keep_top_k")); + if (keep_top_k < 0) return false; + + auto registry = GetPluginRegistry(); + if (registry == nullptr) return false; + } + + if (op_type == "nearest_interp") { + std::vector attrs{"data_layout", "interp_method", + "align_corners", "scale", + "out_h", "out_w"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW && + data_layout != framework::DataLayout::kNHWC) + return false; + auto interp_method = + BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); + if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) return false; + + std::vector attrs{"pooled_height", "pooled_width", + "spatial_scale", "sampling_ratio"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + + const auto pooled_height = + BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); + if (pooled_height <= 0) return false; + + const auto pooled_width = + BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); + if (pooled_width <= 0) return false; + + const auto spatial_scale = + BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); + if (spatial_scale <= 0.f) return false; + } + + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. 
Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. Slice on batch axis is not " + "supported in TensorRT"; + return false; + } + } + } + } + } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but reveceid Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only support zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + + if (op_type == "fc") { + int x_num_col_dims = + desc.HasAttr("x_num_col_dims") + ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) + : (desc.HasAttr("in_num_col_dims") + ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) + : 1); + if (x_num_col_dims < 1) { + VLOG(3) << "converter expects x_num_col_dims >= 1, " + "but x_num_col_dims = %d."; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e37beb3b8e5c3680eda481009699091dcc1ee7a3..1804e6c5571d3a15b0b9adc67dc535b46635caa8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,4 +5,10 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + anchor_generator_op_plugin.cu + yolo_box_op_plugin.cu + roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + +nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS + paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..01ee86ceb48a9ef022ba73fe0dbdab4a52324cc6 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -0,0 +1,566 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
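The op_teller.cc additions above decide, per op and per attribute configuration, whether a node may enter the TensorRT subgraph; returning false keeps the op on the native Paddle kernel, and the enlarged int8_teller_set plays the same role for the no-calib int8 path. A minimal sketch of how a pass consults the teller; the Global() accessor and the argument order follow this file but are an assumption, since the call site is not part of this patch:

auto& teller = paddle::inference::tensorrt::OpTeller::Global();  // assumed accessor
if (!teller.Tell(node, /*use_no_calib_int8=*/false, /*with_dynamic_shape=*/true)) {
  // unsupported op or attribute combination: leave the node outside the
  // TensorRT engine and run it with the regular Paddle operator
}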
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#define PrepareParamsOnDevice() \ + constexpr int data_size = 4; \ + cudaMalloc(&anchor_sizes_device_, anchor_sizes_.size() * data_size); \ + cudaMalloc(&aspect_ratios_device_, aspect_ratios_.size() * data_size); \ + cudaMalloc(&stride_device_, stride_.size() * data_size); \ + cudaMalloc(&variances_device_, variances_.size() * data_size); \ + cudaMemcpy(anchor_sizes_device_, anchor_sizes_.data(), \ + anchor_sizes_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(aspect_ratios_device_, aspect_ratios_.data(), \ + aspect_ratios_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(stride_device_, stride_.data(), stride_.size() * data_size, \ + cudaMemcpyHostToDevice); \ + cudaMemcpy(variances_device_, variances_.data(), \ + variances_.size() * data_size, cudaMemcpyHostToDevice); + +AnchorGeneratorPlugin::AnchorGeneratorPlugin( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + height_(height), + width_(width), + num_anchors_(num_anchors), + box_num_(box_num) { + // anchors must be float32, which is the generator proposals' input + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE(height_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts height " + "greater than 0, but receive height = %d.", + height_)); + PADDLE_ENFORCE_GE(width_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts width " + "greater than 0, but receive width = %d.", + width_)); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PADDLE_ENFORCE_GE(box_num_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts box_num " + "greater than 0, but receive box_num = %d.", + box_num_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPlugin::~AnchorGeneratorPlugin() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &height_); + DeserializeValue(&data, &length, &width_); + DeserializeValue(&data, 
&length, &num_anchors_); + DeserializeValue(&data, &length, &box_num_); + PrepareParamsOnDevice(); +} + +const char* AnchorGeneratorPlugin::getPluginType() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; } + +int AnchorGeneratorPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nb_input_dims) { + nvinfer1::Dims dims{}; + dims.nbDims = 4; + dims.d[0] = height_; + dims.d[1] = width_; + dims.d[2] = num_anchors_; + dims.d[3] = 4; + return dims; +} + +bool AnchorGeneratorPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::TensorFormat format) const { + // static shape plugin can't support different type between input/out + // it may cause addition overhead in half mode + return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const { + return 0; +} + +template +int AnchorGeneratorPlugin::enqueue_impl(int batch_size, + const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int block = 512; + const int gen_anchor_grid = (box_num_ + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height_, width_, + offset_); + const int var_grid = (box_num_ * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num_ * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); +} + +int AnchorGeneratorPlugin::initialize() { return 0; } + +void AnchorGeneratorPlugin::terminate() {} + +size_t AnchorGeneratorPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(height_); + serialize_size += SerializedSize(width_); + serialize_size += SerializedSize(num_anchors_); + serialize_size += SerializedSize(box_num_); + return serialize_size; +} + +void AnchorGeneratorPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, height_); + SerializeValue(&buffer, width_); + SerializeValue(&buffer, num_anchors_); + SerializeValue(&buffer, box_num_); +} + +void AnchorGeneratorPlugin::destroy() {} + +void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const 
char* AnchorGeneratorPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( + int output_index, const bool* input_is_broadcast, int nb_inputs) const { + return true; +} + +bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch( + int input_index) const { + return false; +} + +void AnchorGeneratorPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { + auto plugin = new AnchorGeneratorPlugin( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + height_, width_, num_anchors_, box_num_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +void AnchorGeneratorPluginCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginCreator::getPluginName() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPluginCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int height = -1, width = -1; + int num_anchors = -1; + int box_num = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("height")) { + height = *static_cast(fc->fields[i].data); + } else if (field_name.compare("width")) { + width = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else if (field_name.compare("box_num")) { + box_num = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new 
AnchorGeneratorPlugin(nvinfer1::DataType::kFLOAT, anchor_sizes, + aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +#if IS_TRT_VERSION_GE(6000) +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPluginDynamic::~AnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data, + size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { + auto plugin = new AnchorGeneratorPluginDynamic( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + num_anchors_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[2]; // feature height + ret.d[1] = inputs[0].d[3]; // feature width + ret.d[2] = exprBuilder.constant(num_anchors_); + ret.d[3] = exprBuilder.constant(4); + return ret; +} + +bool AnchorGeneratorPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + // input can be any, doesn't matter + // anchor generator doesn't read input raw data, only need the shape info + auto type = inOut[pos].type; + auto format = inOut[pos].format; +#if IS_TRT_VERSION_GE(7234) + if (pos == 0) return true; +#else + if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR; +#endif + return (type == nvinfer1::DataType::kFLOAT && + format == 
nvinfer1::TensorFormat::kLINEAR); +} + +void AnchorGeneratorPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t AnchorGeneratorPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int AnchorGeneratorPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + const int height = inputDesc[0].dims.d[2]; + const int width = inputDesc[0].dims.d[3]; + const int box_num = height * width * num_anchors_; + const int block = 512; + const int gen_anchor_grid = (box_num + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height, width, + offset_); + const int var_grid = (box_num * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT); + assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT); + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, workspace, + stream); +} + +nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* AnchorGeneratorPluginDynamic::getPluginType() const { + return "anchor_generator_plugin_dynamic"; +} + +int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; } + +int AnchorGeneratorPluginDynamic::initialize() { return 0; } + +void AnchorGeneratorPluginDynamic::terminate() {} + +size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void AnchorGeneratorPluginDynamic::destroy() {} + +void AnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const 
char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const { + return "anchor_generator_plugin_dynamic"; +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, aspect_ratios, stride, + variances, offset, num_anchors); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..aff0b6a6802f114a25acf32627a39ca42d572d7c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -0,0 +1,201 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
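The implementation above provides two variants of the anchor generator plugin: the IPluginV2Ext version bakes the feature-map height/width and the resulting box_num into the plugin at build time, while the IPluginV2DynamicExt version reads them from the input descriptor in enqueue. A minimal sketch of how a converter would construct each, assuming the attribute vectors (std::vector<float>) and the feature-map size feat_h/feat_w have already been read from the op:

// static-shape engine: spatial dims are known when the network is built
auto* plugin = new plugin::AnchorGeneratorPlugin(
    nvinfer1::DataType::kFLOAT, anchor_sizes, aspect_ratios, stride, variances,
    offset, feat_h, feat_w, num_anchors,
    /*box_num=*/feat_h * feat_w * num_anchors);

// dynamic-shape engine: height/width come from inputDesc at enqueue time
auto* dyn_plugin = new plugin::AnchorGeneratorPluginDynamic(
    nvinfer1::DataType::kFLOAT, anchor_sizes, aspect_ratios, stride, variances,
    offset, num_anchors);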
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit AnchorGeneratorPlugin( + const nvinfer1::DataType, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num); + AnchorGeneratorPlugin(const void* data, size_t length); + ~AnchorGeneratorPlugin() override; + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int height_; + int width_; + int num_anchors_; + int box_num_; + std::string namespace_; +}; + +class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginCreator() = default; + ~AnchorGeneratorPluginCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + 
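REGISTER_TRT_PLUGIN_V2 below puts AnchorGeneratorPluginCreator into the global TensorRT plugin registry, and the creator's getPluginName()/getPluginVersion() must match the plugin's getPluginType()/getPluginVersion() ("anchor_generator_plugin", "1") so a serialized engine can find it again. A minimal sketch of the lookup TensorRT performs when deserializing an engine (standard IPluginRegistry API; the actual call happens inside TensorRT, not in this patch):

auto* creator =
    getPluginRegistry()->getPluginCreator("anchor_generator_plugin", "1");
// TensorRT then calls creator->deserializePlugin(layer_name, serial_data,
// serial_length), which rebuilds the plugin from the fields written in
// serialize() via the (const void*, size_t) constructor above.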
+REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginCreator); + +#if IS_TRT_VERSION_GE(6000) +class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit AnchorGeneratorPluginDynamic(const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + AnchorGeneratorPluginDynamic(void const* data, size_t length); + ~AnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginDynamicCreator() = default; + ~AnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 457d9dd87375477926480bce0a84e8f89c409698..cc17f8aa2481708e3e19c9925a1d83ad06203145 100644 --- 
a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -152,9 +152,14 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, int ElementwisePluginDynamic::initialize() { return 0; } -size_t ElementwisePluginDynamic::getSerializationSize() const { return 0; } +size_t ElementwisePluginDynamic::getSerializationSize() const { + return SerializedSize(type_.c_str()) + SerializedSize(axis_); +} -void ElementwisePluginDynamic::serialize(void *buffer) const {} +void ElementwisePluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, type_.c_str()); + SerializeValue(&buffer, axis_); +} nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index e37511868d88f600a733df4ebb478e74a385be1b..75a1dd85f0f2c440fdd16beb95144df4127739e6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -92,7 +92,12 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { public: explicit ElementwisePluginDynamic(const std::string& type, int axis) : type_(type), axis_(axis) {} - ElementwisePluginDynamic(void const* serialData, size_t serialLength) {} + ElementwisePluginDynamic(void const* serialData, size_t serialLength) { + const char* elementwise_type; + DeserializeValue(&serialData, &serialLength, &elementwise_type); + type_ = std::string(elementwise_type); + DeserializeValue(&serialData, &serialLength, &axis_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { return new ElementwisePluginDynamic(type_, axis_); } @@ -138,6 +143,46 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { std::string type_; int axis_; }; + +class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + ElementwisePluginDynamicCreator() {} + const char* getPluginName() const override { return "elementwise_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new ElementwisePluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(ElementwisePluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 238daa4a886a48036cdcd29a1173509b791254fd..6d3872aaeb8a77acf1455e4d5e555ee01d36478a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -200,12 +200,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( "but it's (%d)", output_index)); nvinfer1::DimsExprs ret; - ret.nbDims = 5; + ret.nbDims = 3; ret.d[0] = inputs[0].d[0]; ret.d[1] = inputs[0].d[1]; ret.d[2] = expr_builder.constant(hidden_size_); - ret.d[3] = expr_builder.constant(1); - ret.d[4] = expr_builder.constant(1); return ret; } diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 6c8381a750cba96796dd063dd54de779b5933a9f..7de84a8fc49bcc4cf94e8d406ab0362cbacfb175 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -306,9 +306,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } }; -class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { +class EmbEltwiseLayernormPluginDynamicCreator + : public nvinfer1::IPluginCreator { public: - EmbEltwiseLayernormPluginV2Creator() {} + EmbEltwiseLayernormPluginDynamicCreator() {} const char* getPluginName() const override { return "fused_embedding_eltwise_layernorm_plugin"; } @@ -345,7 +346,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 979f600a3a9cea0ab5bc35fc0c2882cf34c82c98..23e507ee477e1a3b85339c7b267b290de19805ab 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -115,9 +115,9 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginV2Creator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginV2Creator() {} + GeluPluginDynamicCreator() {} const char* getPluginName() const override { return "gelu_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -153,7 +153,7 @@ class GeluPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 1e7c83f4c60fb99964bf583087e7fd1f8c32d704..214e1a81e7dc04161a07f4c0bec643bf65b6c9f0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -169,12 +169,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( "it has (%d) inputs", nb_inputs)); nvinfer1::DimsExprs ret; - ret.nbDims = 5; + ret.nbDims = 3; ret.d[0] = inputs[0].d[0]; ret.d[1] = inputs[0].d[1]; ret.d[2] = expr_builder.constant(head_size_ * head_number_); - ret.d[3] = expr_builder.constant(1); - ret.d[4] = expr_builder.constant(1); return ret; } @@ -227,6 +225,14 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( return input_types[0]; } +template +__global__ void 
apply_scale(T *data, T scale, int n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + data[tid] = data[tid] * scale; +#endif +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, @@ -293,10 +299,17 @@ int QkvToContextPluginDynamic::enqueue( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); + int n_q = seq_len * head_number_ * head_size_; + constexpr int threads = 128; + int blocks = (n_q + threads - 1) / threads; + + apply_scale<<>>(tptr, static_cast(scale_), + n_q); + const platform::CUDADeviceContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, seq_len, head_number_, head_size_, - qkptr, input1_data, tptr, half(scale_), half(0.0)); + qkptr, input1_data, tptr, half(1.), half(0.0)); int grid = batch * head_number_ * seq_len; int block = head_size_; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index b852f5a454c07ca9684f7bb12aa62275c3121de3..7147d9855755bec0fff814c32edf391269b6fe03 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -118,9 +118,9 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; }; -class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { +class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QkvToContextPluginV2Creator() {} + QkvToContextPluginDynamicCreator() {} const char* getPluginName() const override { return "qkv_to_context_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..6e7ed0054f502ea014d3648ac0be22c167987735 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -0,0 +1,381 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
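In the qkv_to_context hunks above, the query-key scaling is moved out of MultiHeadGPUComputeFunctor (which now receives half(1.)) and applied beforehand by a small element-wise kernel over n_q = seq_len * head_number_ * head_size_ values. A reconstructed sketch of that kernel and its launch, with the template header and launch configuration restored (the tid bound check is an extra safety guard, not part of the hunk):

// Scale each element of the Q tensor before the batched attention GEMMs.
template <typename T>
__global__ void apply_scale(T *data, T scale, int n) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) data[tid] = data[tid] * scale;
#endif
}

// Launch side inside QkvToContextPluginDynamic::enqueue (fp16 path), as used above:
//   int n_q = seq_len * head_number_ * head_size_;
//   constexpr int threads = 128;
//   int blocks = (n_q + threads - 1) / threads;
//   apply_scale<<<blocks, threads, 0, stream>>>(tptr, static_cast<half>(scale_), n_q);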
+ +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +__inline__ __device__ T BilinearInterpolate(const T* input_data, + const int height, const int width, + T y, T x) { + if (y < -1.f || y > height || x < -1.f || x > width) return 0; + y = y <= 0.f ? 0.f : y; + x = x <= 0.f ? 0.f : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1.f - ly, hx = 1.f - lx; + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignOpt(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, const int num_rois, + OutT* __restrict__ output_data) { + const int batch = blockIdx.x; + const int channel = blockIdx.y; + const T* offset_input_data = + input_data + (batch * channels + channel) * height * width; + extern __shared__ T s_input_data[]; + if (USE_SMEM) { + for (int idx = threadIdx.x; idx < height * width; idx += blockDim.x) { + s_input_data[idx] = offset_input_data[idx]; + } + __syncthreads(); + } + for (int idx = threadIdx.x; idx < num_rois * pooled_height * pooled_width; + idx += blockDim.x) { + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; + const int n = batch * num_rois + roi_idx; + const float4 rois_offset = reinterpret_cast(input_rois)[n]; + const T roi_xmin = rois_offset.x * spatial_scale; + const T roi_ymin = rois_offset.y * spatial_scale; + const T roi_xmax = rois_offset.z * spatial_scale; + const T roi_ymax = rois_offset.w * spatial_scale; + const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); + const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); + const T bin_size_h = roi_height / static_cast(pooled_height); + const T bin_size_w = roi_width / static_cast(pooled_width); + const int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + + T output_val = 0.f; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + if (USE_SMEM) { + T val = BilinearInterpolate(s_input_data, height, width, y, x); + output_val += val; + } else { + T val = + BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + } + output_val /= count; + const int out_offset = + batch * num_rois * channels * pooled_height * pooled_width + + roi_idx * channels * pooled_height * pooled_width + + channel * pooled_height * pooled_width + ph * pooled_width + pw; + output_data[out_offset] = static_cast(output_val); + } +} + +#if IS_TRT_VERSION_GE(6000) +RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, + float spatial_scale, + int sampling_ratio) + : data_type_(data_type), + pooled_height_(pooled_height), + pooled_width_(pooled_width), + spatial_scale_(spatial_scale), + sampling_ratio_(sampling_ratio) { + bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF; + PADDLE_ENFORCE_EQ(data_type_is_valid, true, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts kFLOAT(%d) or " + "kHALF(%d) data type, but the received data type = %d", + static_cast(nvinfer1::DataType::kFLOAT), + static_cast(nvinfer1::DataType::kHALF), + static_cast(data_type_))); + + PADDLE_ENFORCE_GT(pooled_height_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_height " + "greater than %d, but the received pooled_height = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(pooled_width_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_width greater " + "than %d, but the received pooled_width = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(spatial_scale_, 0.f, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts spatial_scale " + "greater than %f, but the received spatial_scale = %f", + 0, spatial_scale_)); + + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &pooled_height_); + DeserializeValue(&data, &length, &pooled_width_); + DeserializeValue(&data, &length, &spatial_scale_); + DeserializeValue(&data, &length, &sampling_ratio_); + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { + auto* plugin = + new 
RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, + spatial_scale_, sampling_ratio_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[1].d[0]; // roi + ret.d[1] = inputs[0].d[1]; // X + ret.d[2] = exprBuilder.constant(pooled_height_); + ret.d[3] = exprBuilder.constant(pooled_width_); + return ret; +} + +bool RoiAlignPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) { + return false; + } + if (pos < 2) { // input + return inOut[pos].type == nvinfer1::DataType::kFLOAT; + } + return inOut[pos].type == data_type_; +} + +void RoiAlignPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t RoiAlignPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int RoiAlignPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + auto in_dims = inputDesc[0].dims; + auto rois_dims = inputDesc[1].dims; + auto out_dims = outputDesc[0].dims; + + int rois_num = rois_dims.d[0]; + if (rois_num == 0) return cudaGetLastError() != cudaSuccess; + + int batch = in_dims.d[0]; + int channels = in_dims.d[1]; + int height = in_dims.d[2]; + int width = in_dims.d[3]; + + int output_size = + out_dims.d[0] * out_dims.d[1] * out_dims.d[2] * out_dims.d[3]; + + const dim3 blocks(batch, channels); + const int threads = 512; + + if (smem_per_block_ < width * height * sizeof(T)) { + GPUROIAlignOpt<<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } else { + GPUROIAlignOpt< + T, OutT, true><<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } + + return cudaGetLastError() != cudaSuccess; +} + +int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) { + PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_, + platform::errors::InvalidArgument( + "TRT RoiAlignPluginDynamic expects outputDesc[0].type " + "equal to data_type_")); + + if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); + } + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); +} + +nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* RoiAlignPluginDynamic::getPluginType() const { + return "roi_align_plugin_dynamic"; +} + +int RoiAlignPluginDynamic::getNbOutputs() const { 
return 1; } + +int RoiAlignPluginDynamic::initialize() { return 0; } + +void RoiAlignPluginDynamic::terminate() {} + +size_t RoiAlignPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(pooled_height_); + serialize_size += SerializedSize(pooled_width_); + serialize_size += SerializedSize(spatial_scale_); + serialize_size += SerializedSize(sampling_ratio_); + return serialize_size; +} + +void RoiAlignPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, pooled_height_); + SerializeValue(&buffer, pooled_width_); + SerializeValue(&buffer, spatial_scale_); + SerializeValue(&buffer, sampling_ratio_); +} + +void RoiAlignPluginDynamic::destroy() {} + +RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {} + +void RoiAlignPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* RoiAlignPluginDynamicCreator::getPluginName() const { + return "roi_align_plugin_dynamic"; +} + +const char* RoiAlignPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +RoiAlignPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + return nullptr; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..bba7d0d5a996641495fba5b8f406bdf37148babe --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
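RoiAlignPluginDynamic::enqueue_impl above launches GPUROIAlignOpt with one block per (batch, channel) pair and stages a whole H x W feature-map slice in shared memory only when it fits the per-block budget queried via cudaDevAttrMaxSharedMemoryPerBlock at construction time. A condensed sketch of that dispatch with the template arguments and launch configuration restored:

// One block per (batch, channel); USE_SMEM selects the shared-memory variant.
const dim3 blocks(batch, channels);
const int threads = 512;

if (smem_per_block_ < width * height * sizeof(T)) {
  // Slice too large for shared memory: the kernel reads global memory directly.
  GPUROIAlignOpt<T, OutT, false><<<blocks, threads, 0, stream>>>(
      output_size, static_cast<const T*>(inputs[0]),
      static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
      width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch,
      static_cast<OutT*>(outputs[0]));
} else {
  // Dynamic shared memory sized to hold one H*W slice of the input feature map.
  GPUROIAlignOpt<T, OutT, true>
      <<<blocks, threads, width * height * sizeof(T), stream>>>(
          output_size, static_cast<const T*>(inputs[0]),
          static_cast<const T*>(inputs[1]), spatial_scale_, channels, height,
          width, pooled_height_, pooled_width_, sampling_ratio_,
          rois_num / batch, static_cast<OutT*>(outputs[0]));
}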
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class RoiAlignPluginDynamic : public DynamicPluginTensorRT { + public: + explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, float spatial_scale, + int sampling_ratio); + RoiAlignPluginDynamic(void const* data, size_t length); + ~RoiAlignPluginDynamic() = default; + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + + nvinfer1::DataType data_type_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; + int smem_per_block_; + std::string namespace_; +}; + +class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + RoiAlignPluginDynamicCreator(); + ~RoiAlignPluginDynamicCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(RoiAlignPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 3b9eea22199d7b1669802fb506fb4218529b4468..7be9e3a740ab1c3532f5a67f06048c6c745eb214 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -54,11 +54,6 @@ void SkipLayerNormPluginDynamic::terminate() { nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { - PADDLE_ENFORCE_EQ( - inputs[0].nbDims, 5, - platform::errors::InvalidArgument( - "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.", - inputs[0].nbDims)); return inputs[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 0e457fdc8f4474e4f7152aac3520193d70b22e65..ac621784550f2f15f69b17fb5fcbd61f32e2eba7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -119,9 +119,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; }; -class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginV2Creator() {} + SkipLayerNormPluginDynamicCreator() {} const char* getPluginName() const override { return "skip_layernorm_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -156,7 +156,7 @@ class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 340406c5e7fae8bf3f298228259e9fa33fc76887..9d4f9a35c3b6fe02981853eb3c0a697d5cb3a199 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -121,9 +121,9 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginV2Creator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SlicePluginV2Creator() {} + SlicePluginDynamicCreator() {} const char* getPluginName() const override { return "slice_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class SlicePluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; }; -REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index 250b944652b93c54ae9587271256b42c6e1bc6b7..fdb14f9ceaf29fe90cd756b77e7c5afff2296f44 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( output.d[1] = one; output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB, *inputs[1].d[0], *one); + // remove padding 1 + output.nbDims -= 2; return output; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 
256aa28206ad1c21dc3245a6e78f7cdc59b29156..1b5c39f8fff855fac4ef8f2ee54faa872023ad05 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -22,11 +22,6 @@ namespace inference { namespace tensorrt { namespace plugin { -SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { - return new SplitPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); - template __device__ int upper_bound(T const* vals, int n, T const& key) { int i = 0; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 5c47ec3a990f584fd02b3515dbc642ffcd921709..1ee895154d6b046c6c18c2e374d3c63f1fcc5d62 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -25,7 +25,7 @@ namespace inference { namespace tensorrt { namespace plugin { -class SplitPlugin : public PluginTensorRT { +class SplitPlugin : public PluginTensorRTV2Ext { public: SplitPlugin() {} SplitPlugin(int axis, std::vector const& output_lengths, bool with_fp16) @@ -39,13 +39,20 @@ class SplitPlugin : public PluginTensorRT { DeserializeValue(&serial_data, &serial_length, &output_length_); } - SplitPlugin* clone() const override { - auto* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); + nvinfer1::IPluginV2Ext* clone() const override { + SplitPlugin* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); + ptr->setPluginNamespace(this->getPluginNamespace()); ptr->shareData(this); return ptr; } - const char* getPluginType() const override { return "split_plugin"; } + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const override { + return input_types[0]; + } + + const char* getPluginType() const override { return "split_plugin_v2ext"; } int getNbOutputs() const override { return output_length_.size(); } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, @@ -53,17 +60,18 @@ class SplitPlugin : public PluginTensorRT { int initialize() override; void terminate() override; - int enqueue(int batchSize, const void* const* inputs, void** outputs, + int enqueue(int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + void destroy() override { delete this; } + protected: - size_t getSerializationSize() override { - return SerializedSize(getPluginType()) + SerializedSize(axis_) + - SerializedSize(output_length_) + getBaseSerializationSize(); + size_t getSerializationSize() const override { + return SerializedSize(axis_) + SerializedSize(output_length_) + + getBaseSerializationSize(); } - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); @@ -83,6 +91,47 @@ class SplitPlugin : public PluginTensorRT { void shareData(const SplitPlugin* another); }; +class SplitPluginCreator : public nvinfer1::IPluginCreator { + public: + SplitPluginCreator() {} + const char* getPluginName() const override { return "split_plugin_v2ext"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const 
nvinfer1::PluginFieldCollection* fc) override { + // not implemented + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SplitPlugin(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(SplitPluginCreator); + #if IS_TRT_VERSION_GE(6000) class SplitPluginDynamic : public DynamicPluginTensorRT { public: @@ -144,9 +193,9 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { std::vector output_length_; }; -class SplitPluginV2Creator : public nvinfer1::IPluginCreator { +class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SplitPluginV2Creator() {} + SplitPluginDynamicCreator() {} const char* getPluginName() const override { return "split_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -182,7 +231,7 @@ class SplitPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SplitPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 85cc6916238fefa028310b07e02301f10e07aefd..11579aadcc45731123770352ef08b362ff3ef745 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -127,9 +127,9 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginV2Creator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SwishPluginV2Creator() {} + SwishPluginDynamicCreator() {} const char* getPluginName() const override { return "swish_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -165,7 +165,7 @@ class SwishPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc new file mode 100644 index 0000000000000000000000000000000000000000..6636513a555f9e638e1dfdb54986010c76785e2a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +TEST(split_op_plugin, test_plugin) { + int axis = 1; + std::vector output_lengths{1, 1}; + bool with_fp16 = false; + std::vector input_types{nvinfer1::DataType::kFLOAT}; + std::vector input_dims; + + SplitPlugin sp_plugin(axis, output_lengths, with_fp16); + nvinfer1::Dims in_dims; + in_dims.nbDims = 4; + input_dims.push_back(in_dims); + sp_plugin.configurePlugin(input_dims.data(), 1, nullptr, 2, + input_types.data(), nullptr, nullptr, nullptr, + nvinfer1::PluginFormat::kNCHW, 4); + sp_plugin.initialize(); + sp_plugin.getPluginType(); + sp_plugin.canBroadcastInputAcrossBatch(0); + sp_plugin.getNbOutputs(); + auto clone_plugin = sp_plugin.clone(); + clone_plugin->setPluginNamespace("test"); + clone_plugin->destroy(); + sp_plugin.getOutputDataType(0, input_types.data(), 1); + sp_plugin.terminate(); +} + +TEST(split_op_plugin, test_plugin_creater) { + SplitPluginCreator creator; + creator.getFieldNames(); + creator.createPlugin("test", nullptr); + creator.setPluginNamespace("test"); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index fd721b161450d7a8d4660ca09ea3a1093d754664..55bc786746beafcf7b2df98d54e9391e6a59ba24 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -19,27 +19,50 @@ namespace inference { namespace tensorrt { namespace plugin { +inline void Seria(void*& buffer, // NOLINT + const std::vector& input_dims, + size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::PluginFormat data_format, bool with_fp16) { + SerializeValue(&buffer, input_dims); + SerializeValue(&buffer, max_batch_size); + SerializeValue(&buffer, data_type); + SerializeValue(&buffer, data_format); + SerializeValue(&buffer, with_fp16); +} + +inline void Deseria(void const*& serial_data, size_t& serial_length, // NOLINT + std::vector* input_dims, + size_t* max_batch_size, nvinfer1::DataType* data_type, + nvinfer1::PluginFormat* data_format, bool* with_fp16) { + DeserializeValue(&serial_data, &serial_length, input_dims); + DeserializeValue(&serial_data, &serial_length, max_batch_size); + DeserializeValue(&serial_data, &serial_length, data_type); + DeserializeValue(&serial_data, &serial_length, data_format); + DeserializeValue(&serial_data, &serial_length, with_fp16); +} + +inline size_t SeriaSize(const std::vector& input_dims, + size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::PluginFormat data_format, bool with_fp16) { + return (SerializedSize(input_dims) + SerializedSize(max_batch_size) + + SerializedSize(data_type) + SerializedSize(data_format) + + SerializedSize(with_fp16)); +} + void PluginTensorRT::serializeBase(void*& buffer) { - SerializeValue(&buffer, input_dims_); - SerializeValue(&buffer, max_batch_size_); - SerializeValue(&buffer, data_type_); - SerializeValue(&buffer, data_format_); - SerializeValue(&buffer, with_fp16_); + Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); } void PluginTensorRT::deserializeBase(void const*& serial_data, size_t& serial_length) { - DeserializeValue(&serial_data, &serial_length, &input_dims_); - DeserializeValue(&serial_data, &serial_length, &max_batch_size_); - DeserializeValue(&serial_data, 
&serial_length, &data_type_); - DeserializeValue(&serial_data, &serial_length, &data_format_); - DeserializeValue(&serial_data, &serial_length, &with_fp16_); + Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, + &data_type_, &data_format_, &with_fp16_); } size_t PluginTensorRT::getBaseSerializationSize() { - return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + - SerializedSize(data_type_) + SerializedSize(data_format_) + - SerializedSize(with_fp16_)); + return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, @@ -58,6 +81,35 @@ void PluginTensorRT::configureWithFormat( max_batch_size_ = max_batch_size; } +void PluginTensorRTV2Ext::serializeBase(void*& buffer) const { + Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); +} + +void PluginTensorRTV2Ext::deserializeBase(void const*& serial_data, + size_t& serial_length) { + Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, + &data_type_, &data_format_, &with_fp16_); +} + +size_t PluginTensorRTV2Ext::getBaseSerializationSize() const { + return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); +} + +void PluginTensorRTV2Ext::configurePlugin( + const nvinfer1::Dims* input_dims, int32_t nb_inputs, + const nvinfer1::Dims* output_dims, int32_t nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int32_t max_batch_size) { + input_dims_.assign(input_dims, input_dims + nb_inputs); + max_batch_size_ = max_batch_size; + data_format_ = float_format; + data_type_ = input_types[0]; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index b3a3abe5d01fc53e2ef3da7722df0e372d605af4..ce3133ae99e94c62c0c8e958065700373d270037 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -44,6 +44,7 @@ typedef std::function typedef std::function PluginConstructFunc; +// Deprecated. 
Do not inherit this class, please refer to PluginTensorRTV2Ext class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() : with_fp16_(false) {} @@ -119,6 +120,114 @@ class PluginTensorRT : public nvinfer1::IPluginExt { bool with_fp16_; }; +// TensorRT introduced IPluginV2Ext after 5.1, Paddle no longer supports +// versions before 5.1 +class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { + public: + PluginTensorRTV2Ext() : with_fp16_(false) {} + PluginTensorRTV2Ext(const void* serialized_data, size_t length) {} + + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + + // The Func in IPluginV2Ext + virtual nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const = 0; + + virtual bool isOutputBroadcastAcrossBatch(int32_t output_index, + const bool* input_is_broadcasted, + int32_t nb_inputs) const { + return false; + } + + virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const { + return false; + } + + void configurePlugin(const nvinfer1::Dims* input_dims, int32_t nb_inputs, + const nvinfer1::Dims* output_dims, int32_t nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int32_t max_batch_size) override; + + virtual IPluginV2Ext* clone() const = 0; + + void attachToContext(cudnnContext*, cublasContext*, + nvinfer1::IGpuAllocator*) override {} + + void detachFromContext() override {} + + // The Func in IPluginV2 + virtual const char* getPluginType() const = 0; + const char* getPluginVersion() const override { return "1"; } + virtual int32_t getNbOutputs() const { return 1; } + virtual nvinfer1::Dims getOutputDimensions(int32_t index, + const nvinfer1::Dims* inputs, + int32_t nb_input) = 0; + // Check format support. The default is FLOAT32 and NCHW. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); + } + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + + // Shutdown the layer. This is called when the engine is destroyed + void terminate() override {} + + // Find the workspace size required by the layer + size_t getWorkspaceSize(int) const override { return 0; } + + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() const = 0; + + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. 
+ virtual void serialize(void* buffer) const = 0; + + virtual void destroy() = 0; + + void setPluginNamespace(const char* plugin_namespace) override { + name_space_ = plugin_namespace; + } + + const char* getPluginNamespace() const override { + return name_space_.c_str(); + } + + protected: + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT + size_t getBaseSerializationSize() const; + void serializeBase(void*& buffer) const; // NOLINT + + protected: + std::vector input_dims_; + size_t max_batch_size_; + nvinfer1::DataType data_type_; + nvinfer1::PluginFormat data_format_; + std::vector inputs_; + bool with_fp16_; + + private: + std::string name_space_; +}; + #if IS_TRT_VERSION_GE(6000) class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { public: @@ -184,6 +293,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { std::string name_space_; std::string plugin_base_; }; +#endif template class TrtPluginRegistrarV2 { @@ -203,8 +313,6 @@ class TrtPluginRegistrarV2 { static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ plugin_registrar_##name {} -#endif - } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..13d07e774036a48b0ed6e3c91b168eaab4461df5 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -0,0 +1,401 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
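Each creator above is registered through REGISTER_TRT_PLUGIN_V2, which expands to a file-scope TrtPluginRegistrarV2<Creator> instance. The registrar body is not shown in the hunk; a minimal sketch of what such a registrar is assumed to do, namely hand the creator to TensorRT's global plugin registry so serialized engines can later look plugins up by type and version:

// Sketch only: static registration of an IPluginCreator with TensorRT.
template <typename PluginCreatorT>
class TrtPluginRegistrarV2 {
 public:
  TrtPluginRegistrarV2() {
    // getPluginRegistry() is nvinfer1's global registry; the empty string is
    // the plugin namespace, matching the creators defined above.
    getPluginRegistry()->registerCreator(creator_, "");
  }

 private:
  PluginCreatorT creator_;
};

// REGISTER_TRT_PLUGIN_V2(SplitPluginCreator) then becomes a static
// TrtPluginRegistrarV2<SplitPluginCreator> object, so a deserialized engine
// can recover "split_plugin_v2ext" plugins via the creator's deserializePlugin.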
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" +#include "paddle/fluid/operators/detection/yolo_box_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, + const int class_num, const float conf_thresh, + const int downsample_ratio, const bool clip_bbox, + const float scale_x_y, const int input_h, + const int input_w) + : data_type_(data_type), + class_num_(class_num), + conf_thresh_(conf_thresh), + downsample_ratio_(downsample_ratio), + clip_bbox_(clip_bbox), + scale_x_y_(scale_x_y), + input_h_(input_h), + input_w_(input_w) { + anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); + assert(data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF); + assert(class_num_ > 0); + assert(input_h_ > 0); + assert(input_w_ > 0); + + cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); + cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +} + +YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + DeserializeValue(&data, &length, &conf_thresh_); + DeserializeValue(&data, &length, &downsample_ratio_); + DeserializeValue(&data, &length, &clip_bbox_); + DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &input_h_); + DeserializeValue(&data, &length, &input_w_); +} + +YoloBoxPlugin::~YoloBoxPlugin() { + if (anchors_device_ != nullptr) { + cudaFree(anchors_device_); + anchors_device_ = nullptr; + } +} + +const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; } + +const char* YoloBoxPlugin::getPluginVersion() const { return "1"; } + +int YoloBoxPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nb_input_dims) { + const int anchor_num = anchors_.size() / 2; + const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num; + + assert(index <= 1); + + if (index == 0) { + return nvinfer1::Dims2(box_num, 4); + } + return nvinfer1::Dims2(box_num, class_num_); +} + +bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const { + return ((type == data_type_ || type == nvinfer1::DataType::kINT32) && + format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; } + +template +__device__ inline T sigmoid(T x) { + return 1. / (1. 
+ exp(-x)); +} + +template <> +__device__ inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +template +__device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, + int i, int j, int an_idx, int grid_size_h, + int grid_size_w, int input_size_h, + int input_size_w, int index, int stride, + int img_height, int img_width, float scale, + float bias) { + box[0] = static_cast( + (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + grid_size_w); + box[1] = static_cast( + (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + img_height / grid_size_h); + box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * + anchors[2 * an_idx] * img_width / input_size_w); + box[3] = + static_cast(expf(static_cast(x[index + 3 * stride])) * + anchors[2 * an_idx + 1] * img_height / input_size_h); +} + +__device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +__device__ inline void CalcDetectionBox(T* boxes, const float* box, + const int box_idx, const int img_height, + const int img_width, bool clip_bbox) { + float tmp_box_0, tmp_box_1, tmp_box_2, tmp_box_3; + tmp_box_0 = box[0] - box[2] / 2; + tmp_box_1 = box[1] - box[3] / 2; + tmp_box_2 = box[0] + box[2] / 2; + tmp_box_3 = box[1] + box[3] / 2; + + if (clip_bbox) { + tmp_box_0 = max(tmp_box_0, 0.f); + tmp_box_1 = max(tmp_box_1, 0.f); + tmp_box_2 = min(tmp_box_2, static_cast(img_width - 1)); + tmp_box_3 = min(tmp_box_3, static_cast(img_height - 1)); + } + + boxes[box_idx + 0] = static_cast(tmp_box_0); + boxes[box_idx + 1] = static_cast(tmp_box_1); + boxes[box_idx + 2] = static_cast(tmp_box_2); + boxes[box_idx + 3] = static_cast(tmp_box_3); +} + +template +__device__ inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = static_cast( + conf * sigmoid(static_cast(input[label_idx + i * stride]))); + } +} + +template +__global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, + T* boxes, T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size_h, + int input_size_w, bool clip_bbox, const float scale, + const float bias) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + float conf = sigmoid(static_cast(input[obj_idx])); + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + + if (conf < conf_thresh) { + for (int i = 0; i < 4; ++i) { + box[i] = 0.f; + } + } else { + GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, + input_size_w, box_idx, grid_num, img_height, img_width, + scale, bias); + } + + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = + GetEntryIndex(i, 
j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int n = batch_size; + const int h = input_h_; + const int w = input_w_; + const int an_num = anchors_.size() / 2; + const int box_num = h * w * an_num; + int input_size_h = downsample_ratio_ * h; + int input_size_w = downsample_ratio_ * w; + + float bias = -0.5 * (scale_x_y_ - 1.); + constexpr int threads = 256; + + KeYoloBoxFw<<<(n * box_num + threads - 1) / threads, threads, 0, stream>>>( + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), + reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), + conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + return cudaGetLastError() != cudaSuccess; +} + +int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + if (data_type_ == nvinfer1::DataType::kFLOAT) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } else if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } + assert("unsupported type."); +} + +int YoloBoxPlugin::initialize() { return 0; } + +void YoloBoxPlugin::terminate() {} + +size_t YoloBoxPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchors_); + serialize_size += SerializedSize(class_num_); + serialize_size += SerializedSize(conf_thresh_); + serialize_size += SerializedSize(downsample_ratio_); + serialize_size += SerializedSize(clip_bbox_); + serialize_size += SerializedSize(scale_x_y_); + serialize_size += SerializedSize(input_h_); + serialize_size += SerializedSize(input_w_); + return serialize_size; +} + +void YoloBoxPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + SerializeValue(&buffer, conf_thresh_); + SerializeValue(&buffer, downsample_ratio_); + SerializeValue(&buffer, clip_bbox_); + SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, input_h_); + SerializeValue(&buffer, input_w_); +} + +void YoloBoxPlugin::destroy() {} + +void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType YoloBoxPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const { + return false; +} + +bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const { + return false; +} + +void YoloBoxPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* 
YoloBoxPlugin::clone() const { + return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, + downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, + input_w_); +} + +YoloBoxPluginCreator::YoloBoxPluginCreator() {} + +void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* YoloBoxPluginCreator::getPluginName() const { + return "yolo_box_plugin"; +} + +const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + + int type_id = -1; + std::vector anchors; + int class_num = -1; + float conf_thresh = 0.01; + int downsample_ratio = 32; + bool clip_bbox = true; + float scale_x_y = 1.; + int h = -1; + int w = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchors")) { + const int length = fc->fields[i].length; + const int* data = static_cast(fc->fields[i].data); + anchors.insert(anchors.end(), data, data + length); + } else if (field_name.compare("class_num")) { + class_num = *static_cast(fc->fields[i].data); + } else if (field_name.compare("conf_thresh")) { + conf_thresh = *static_cast(fc->fields[i].data); + } else if (field_name.compare("downsample_ratio")) { + downsample_ratio = *static_cast(fc->fields[i].data); + } else if (field_name.compare("clip_bbox")) { + clip_bbox = *static_cast(fc->fields[i].data); + } else if (field_name.compare("scale_x_y")) { + scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("h")) { + h = *static_cast(fc->fields[i].data); + } else if (field_name.compare("w")) { + w = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + + return new YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new YoloBoxPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..8ca21da7ae0377164cbb50c502f0abb5ca943058 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, const int class_num, + const float conf_thresh, const int downsample_ratio, + const bool clip_bbox, const float scale_x_y, + const int input_h, const int input_w); + YoloBoxPlugin(const void* data, size_t length); + ~YoloBoxPlugin() override; + + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + nvinfer1::DataType data_type_; + std::vector anchors_; + int* anchors_device_; + int class_num_; + float conf_thresh_; + int downsample_ratio_; + bool clip_bbox_; + float scale_x_y_; + int input_h_; + int input_w_; + std::string namespace_; +}; + +class YoloBoxPluginCreator : public nvinfer1::IPluginCreator { + public: + YoloBoxPluginCreator(); + ~YoloBoxPluginCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + 
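+  // createPlugin() above builds the plugin from the PluginField attributes
+  // supplied when the network is defined; deserializePlugin() below rebuilds
+  // it from the byte stream written by YoloBoxPlugin::serialize() when a
+  // saved engine is loaded. A typical lookup through the TensorRT plugin
+  // registry might look like this (illustrative sketch only; the field
+  // collection name is hypothetical):
+  //   auto* creator =
+  //       getPluginRegistry()->getPluginCreator("yolo_box_plugin", "1");
+  //   auto* plugin = creator->createPlugin("yolo_box", &field_collection);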
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 92f9c20a369d7a6e295b52d66fe61c244fafe943..f74cd671d6dca0cd52bb595f6ee1370b464d9e30 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,15 +522,15 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -538,7 +538,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -576,8 +576,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) - set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() @@ -585,8 +584,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) - set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test 
"ernie_model_4_fp16_unserialized.tgz") endif() @@ -606,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -623,17 +630,17 @@ inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..de9e2afd705f9366e3e703abab517b85be766018 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + 
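+  // Like the checks above, the remaining assertions only round-trip the
+  // PD_Config setters and getters; no predictor is built in this test, so
+  // nothing is actually executed on the GPU here.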
EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..d3a15cb285772d5cc75a460c388223c8da663119 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. "; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..4369cd78dfa3746677d0916ad3a5c106da412ff0 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff 
--git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..18107704ae420f5eacbbac885f77dc7ed042b73f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + 
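+  // The remaining calls cover the profiling, glog and validity switches:
+  // PD_ConfigSetInvalid should make PD_ConfigIsValid return false, and
+  // PD_ConfigPartiallyRelease / PD_ConfigDestroy release the config at the
+  // end of the test.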
PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4017fc5a7f3408b99bb664d52192f7cb35f9144 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + 
PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream 
file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..8951c446b1f83a952abaf55767a3cea52d8f0463 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = 
FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..11de1a5a6fab4f38bb57ff2af451d55658d0cbca --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4fd04e85840de8fdf7afa2903fe52b023bab644 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 6d4bb70df6f3ad5a214543dacbc5d0d1864a67b1..9211ea246a5c5e0cdc75e6fef72ae0e4e40d69af 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -75,14 +75,15 @@ int test_predictor_zero_copy(const AnalysisConfig& config_in, } std::vector<float> input({1}); - auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())}; + auto in_tensor = + predictor->GetInputTensor(predictor->GetInputNames().front()); in_tensor->Reshape({1, 1}); in_tensor->copy_from_cpu(input.data()); predictor->ZeroCopyRun(); - auto out_tensor{ - predictor->GetOutputTensor(predictor->GetOutputNames().front())}; + auto out_tensor = + predictor->GetOutputTensor(predictor->GetOutputNames().front()); std::vector<float> data_o(10); out_tensor->copy_to_cpu(data_o.data()); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea37677389923cabc71dfaf62fd2b11ab4f7c..2ea047fa13c10596995916234ef67e8a276b6b22 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -27,12 +27,18 @@ if (WITH_ROCM) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() +if (WITH_ASCEND_CL) + cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) +endif() + cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index cbeb263b5f41b96c73d67d9f56a407eecf209815..730efa5c646885026eee1e472205ce723b0fcb1b 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc 
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,7 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_info.h" #endif +#include "paddle/fluid/platform/npu_info.h" DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -66,6 +67,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } #endif break; } @@ -185,6 +191,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index c1b12f5c0ecbb6e4b367be0eb0ea9730b9f14ea6..b1a45afa99d9a565bfc3b8b3e6192eca7d2ccd05 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -54,6 +54,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -68,13 +69,14 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" "2. If no, please decrease the batch size of your model. 
%s\n\n", place_.device, string::HumanReadableSize(size), place_.device, - string::HumanReadableSize(avail), place_.device, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + place_.device, err_msg)); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0ada2cafcc16a638cba2e8dbd8d36ce1b219d0b5..3e88d61783c9e67053ef065f61fef5cf991a9b25 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -19,7 +19,10 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -110,6 +113,7 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU @@ -219,6 +223,135 @@ size_t Used(const platform::XPUPlace &place) { #endif } +// For Ascend NPU +#ifdef PADDLE_WITH_ASCEND_CL +class NPUBuddyAllocatorList { + private: + NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { + auto npu_num = devices_.size(); + allocators_.resize(npu_num); + init_flags_.reserve(npu_num); + for (size_t i = 0; i < npu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static NPUBuddyAllocatorList *CreateNewInstance() { + return new NPUBuddyAllocatorList(); + } + + public: + static NPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int npu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetNPUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::NPUAllocator(devices_[pos])), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { + return NPUBuddyAllocatorList::Instance()->Get(npu_id); +} +#endif + +template <> +size_t Used(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::NPUDeviceGuard(place.device); + size_t avail, total; + platform::NPUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + aclrtMemset(ptr, size, 0xEF, size); + } + } + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetNPUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +// For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c965ab829851c7045ad6a1a1e0e93e..1fe85dd699acf18387482d296c2c30f3bb2415cb 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,6 +61,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NaiveBestFitAllocatorTest, NpuAlloc) { + NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + sleep(10); + alloc.Release(platform::NPUPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::NPUPlace(0)); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc new file mode 100644 
index 0000000000000000000000000000000000000000..faf7ae6221caaffeb3266b67c409b4bf61f476f0 --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/npu_allocator.h" +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NPUAllocator::IsAllocThreadSafe() const { return true; } +void NPUAllocator::FreeImpl(Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "NPU memory is freed in incorrect device. This may be a bug")); + platform::RecordedNPUFree(allocation->ptr(), allocation->size(), + place_.device); + delete allocation; +} + +Allocation* NPUAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::SetNPUDeviceId(place_.device); }); + + void* ptr; + auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); + if (LIKELY(result == ACL_ERROR_NONE)) { + return new Allocation(ptr, size, platform::Place(place_)); + } + + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, place_.device); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " + "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " + "GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please decrease the batch size of your model. %s\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(avail), place_.device, err_msg)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..bf668973505bab0b00b2da6111709e27236ffea6 --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUAllocator : public Allocator { + public: + explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + platform::NPUPlace place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index fcae741db3667f4acf9ff33323f3f95710724669..e9631ee739b9b8089a963a6aa84a9837010ad639 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -6,6 +6,8 @@ if(WITH_GPU) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) +elseif(${WITH_ASCEND_CL}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place) else() cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) endif() diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 50c0b58f3a1dd6eafd4ca86f2378cbd8f4b2e041..55436f451a41ff2a77acddfaff3c5a7c290b7ac2 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -21,6 +21,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif +#ifdef PADDLE_WITH_ASCEND_CL +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif namespace paddle { namespace memory { @@ -235,6 +238,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( } } #endif +#ifdef PADDLE_WITH_ASCEND_CL + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes); + } else { + // Compute the re-allocation size, we store the re-allocation size when + // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. + if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { + realloc_size_ = platform::NPUReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif // Allocate a new block void* p = system_allocator_->Alloc(&index, allocate_bytes); diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 15e93deffccda8852b371a60ab3e08f9f8b811c2..135c3b6d04f346d361530ad5586e8f11e023d05c 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 2dc3e73af24162ebdc7872403fe28d83044920dc..290f3d5d1bcd47b40b8ee35ad45cd103bd11b26e 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -19,14 +19,16 @@ limitations under the License. */ #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif +#include +#include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -342,6 +344,32 @@ TEST(BuddyAllocator, Release) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(BuddyAllocator, NpuFraction) { + // In a 16 GB machine, the pool size will be about 160 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.005; + FLAGS_fraction_of_gpu_memory_to_use = 0.92; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new NPUAllocator(0)), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + buddy_allocator.Release(); + + // Greater than max chunk size + TestBuddyAllocator(&buddy_allocator, 300 << 20, + /* use_system_allocator = */ true); + TestBuddyAllocator(&buddy_allocator, 1 * static_cast(1 << 30), + /* use_system_allocator = */ true); +} +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 38baf6c24bab3fb7ca55a15b4f231bf9eba7d82e..0d7065d8bfba0e4ba6f443a3f9e87ee0e1a825a6 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -29,6 +29,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -123,6 +125,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -137,7 +140,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. 
If yes, please stop them, or start PaddlePaddle on another GPU.\n" @@ -148,8 +151,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(avail), gpu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); } } @@ -247,6 +250,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_ASCEND_CL +void* NPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto result = platform::RecordedNPUMalloc(&p, size, npu_id_); + + if (result == ACL_ERROR_NONE) { + *index = 0; + npu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, npu_id_); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + npu_id_, string::HumanReadableSize(size), npu_id_, + string::HumanReadableSize(avail), npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + } +} + +void NPUAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, npu_alloc_size_)); + npu_alloc_size_ -= size; + + platform::RecordedNPUFree(p, size, npu_id_); +} + +bool NPUAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e332bb670da2357f5ed831e743c20579677b90a5..26711ae4070f5ed72f77519b196c4c354cb049e1 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL + +class NPUAllocator : public SystemAllocator { + public: + explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + 
+  size_t npu_alloc_size_ = 0;
+  int npu_id_;
+};
+#endif
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index 13854d771a0bf60bfef90515795ee70d9cb7fb73..ead188341dac46bd3eec490015ff934dc8a26af5 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -85,3 +85,11 @@ TEST(GPUAllocator, AllocFailure) {
   }
 }
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+TEST(NPUAllocator, Alloc) {
+  paddle::memory::detail::NPUAllocator a(0);
+  TestAllocator(&a, 1 << 20);
+  TestAllocator(&a, 1);
+}
+#endif
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 7f871fab5a1470b32dfd44376b7cf30e1b94656a..730d49e8acd93022e6e46f7285b9548ed7a5c6d8 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -40,7 +40,7 @@ void Copy(platform::XPUPlace dst_place,
                                                   platform::CPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -86,7 +86,7 @@ void Copy(platform::CPUPlace dst_place,
                                                   platform::XPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -132,7 +132,7 @@ void Copy(platform::XPUPlace dst_place,
                                                   platform::XPUPlace src_place,
                                                   const void* src, size_t num) {
   if (num <= 0) {
-    VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
+    VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
     return;
   }
   int dev_id = -1;
@@ -196,6 +196,106 @@ void Copy(platform::XPUPlace dst_place,
 }
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+template <>
+void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  platform::SetNPUDeviceId(dst_place.device);
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place << " by stream(" << stream << ")";
+
+  if (stream) {
+    platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
+    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
+  } else {
+    // On NPU, an async operation after a sync operation is OK, while a sync
+    // operation after an async one is not, since the async operation may not
+    // have finished yet. So a wait is needed before the sync operation.
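+    // (Wait() on the corresponding NPUDeviceContext blocks until the work
+    //  already queued on its stream has completed, so the blocking
+    //  NPUMemcpySync below cannot overlap with an earlier asynchronous copy.)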
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
+    platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
+    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
+  }
+}
+
+template <>
+void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::NPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  platform::SetNPUDeviceId(src_place.device);
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place << " by stream(" << stream << ")";
+
+  if (stream) {
+    platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
+    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
+  } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
+
+    platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
+    platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
+  }
+}
+
+template <>
+void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::NPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  aclrtStream stream) {
+  if (UNLIKELY(num == 0)) return;
+
+  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
+          << dst_place << " by stream(" << stream << ")";
+  if (dst_place == src_place) {
+    platform::SetNPUDeviceId(src_place.device);
+    if (stream) {
+      platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU");
+      platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                               stream);
+    } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
+      platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
+      platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
+    }
+  } else {
+    if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Peer access between NPU places is not allowed."));
+    }
+    if (stream) {
+      // TODO(zhiqiu): support peer access?
+      platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU");
+      platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                               stream);
+    } else {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
+      platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
+      platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
+    }
+  }
+}
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024;  // 64K
diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h
index 25490f28b659876ddd1e9e0eef7f23062791a05e..c630437224cd093438df5d8d8a58a5c8f6ab2ad2 100644
--- a/paddle/fluid/memory/memcpy.h
+++ b/paddle/fluid/memory/memcpy.h
@@ -52,7 +52,27 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
           gpuStream_t stream);
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+/**
+ * \brief Copy memory from one place to another place.
+ *
+ * \param[in] DstPlace Destination allocation place (CPU or NPU).
+ * \param[in] dst Destination memory address.
+ * \param[in] SrcPlace Source allocation place (CPU or NPU).
+ * \param[in] src Source memory address.
+ * \param[in] num memory size in bytes to copy.
+ * \param[in] stream NPU stream. + * + * \note For NPU memory copy, NPU stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + aclrtStream stream); #endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 467a5ff9063a65bd7905edd0b9818aa600d595bf..6e11c64afc4bd813362640e151203d4dd700fea5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -10,6 +10,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists copy_if_different(${pybind_file} ${pybind_file_final}) add_subdirectory(math) +add_subdirectory(eigen) add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) @@ -41,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -68,7 +73,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -110,8 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) @@ -121,6 +127,12 @@ if (WITH_ASCEND) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper) endif() +if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) + cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) +endif() + # FIXME(typhoonzero): operator deps may not needed. 
# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) @@ -134,8 +146,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -154,12 +166,22 @@ endif() cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) + cc_library(py_layer_op SRCS py_layer_op.cc DEPS op_registry python pybind) +endif() + +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -173,3 +195,11 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index e373d628f6cbd6b5ee48edc984a68d2767ce0593..97409e6cb1b17b8fc109e30dc78720b8d573f042 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -13,44 +13,79 @@ // limitations under the License. 
#include "paddle/fluid/operators/abs_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +template +struct CudaAbsFunctor; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ math::Real operator()(const T* args) const { + return abs(args[0]); + } +}; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ T operator()(const T* args) const { + return std::abs(args[0]); + } +}; + +template +class AbsKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = CudaAbsFunctor(); + LaunchElementwiseCudaKernel>( + dev_ctx, ins, &outs, functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; +namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel); + abs, ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel); REGISTER_OP_CUDA_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel); + abs_grad, ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel); REGISTER_OP_CUDA_KERNEL( - abs_grad_grad, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel); + abs_grad_grad, ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 94f2eb3672bd5d06f4a3f310cdad39119c336a0f..055909ba6f486ff82220c2d36c54687091bde9ed 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ $$out = \\frac{1}{1 + e^{-x}}$$ )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); @@ -782,6 +789,26 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1041,6 +1068,34 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== tanh register ============================= */ +REGISTER_OPERATOR( + tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::TanhDoubleGradMaker, + ops::TanhDoubleGradMaker) +REGISTER_OPERATOR( + tanh_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_OP_CPU_KERNEL( + tanh_grad_grad, ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 2033081af224a4e938a7b4f0f619729feea57506..618f17031b1ef3b4b96ea72b05f9f63edd01c794 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,32 +10,1409 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { + +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? 
args[0] : zero; + } +}; + +template +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : static_cast(alpha) * args[0]; + } +}; + +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 1 : alpha) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? args[0] : static_cast(alpha) * args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(one / (one + exp(-x))); + } +}; + +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1] * (one - args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSiluFunctor : public BaseActivationFunctor { + // MPType means Compute Type + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x / (one + exp(-x))); + } +}; + +template +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* 
args) const { + MPType x = static_cast(args[0]); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; + +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // atan(x) = atan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1] * args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T l = static_cast(lambda); + return (x >= -l && x <= l) ? 
zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCeilFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // ceil(x) = ceil(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(ceil(x)); + } +}; + +template +struct CudaFloorFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // floor(x) = floor(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(floor(x)); + } +}; + +template +struct CudaRoundFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // round(x) = round(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(round(x)); + } +}; + +// grad functor for ceil, floor and round +template +struct CudaZeroGradFunctor : public BaseActivationFunctor { + __device__ __forceinline__ T operator()(const T* args) const { + return static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cos(x) = cos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sin(x) = sin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cos(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tan(x) = tan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType 
x = static_cast(args[1]); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // asin(x) = asin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // acos(x) = acos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * sinh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cosh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + // 
Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tanh(x)); + } +}; + +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T dout = static_cast(args[0]); + T out = static_cast(args[1]); + return dout * (one - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaReciprocalFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // reciprocal(x) = 1 / x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return one / args[0]; + } +}; + +template +struct CudaReciprocalGradFunctor : public BaseActivationFunctor { + // dx = -dout * out^2 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return -args[0] * args[1] * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaExpFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // exp(x) = exp(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(exp(x)); + } +}; + +template +struct CudaExpGradFunctor : public BaseActivationFunctor { + // dx = dout * out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLogFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log(x) = log(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(x)); + } +}; + +template +struct CudaLogGradFunctor : public BaseActivationFunctor { + // dx = dout / x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSquareFunctor : public BaseActivationFunctor { + // square(x) = x * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[0]; + } +}; + +template +struct CudaSquareGradFunctor : public BaseActivationFunctor { + T two = static_cast(2.0f); + + // dx = dout * 2 * x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * two * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sqrt(x) = sqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sqrt(x)); + } +}; + +template +struct CudaSqrtGradFunctor : public BaseActivationFunctor { + T one_half = static_cast(0.5f); + + // dx = dout * 0.5 / out + // 
Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return one_half * args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaRsqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // rsqrt(x) = rsqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(rsqrt(x)); + } +}; + +template +struct CudaRsqrtGradFunctor : public BaseActivationFunctor { + T minus_one_half = static_cast(-0.5f); + + // dx = dout * -0.5 / out^3 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return minus_one_half * args[0] * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLog1pFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // log1p(x) = log(1 + x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(one + x)); + } +}; + +template +struct CudaLog1pGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog2Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log2(x) = log2(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log2(x)); + } +}; + +template +struct CudaLog2GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_two = static_cast(log(static_cast(2.0f))); + + // dx = dout / (x * log(2)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_two); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog10Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log10(x) = log10(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log10(x)); + } +}; + +template +struct CudaLog10GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_ten = static_cast(log(static_cast(10.0f))); + + // dx = dout / (x * log(10)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_ten); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + // Inputs: args[0], the 
input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; + +template +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T dout = args[0]; + T x = args[1]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftReluFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) + // Inputs: args[0], the input x + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType t = static_cast(threshold); + MPType temp_min = x < t ? x : t; + MPType temp_max = temp_min > -t ? temp_min : -t; + return static_cast(log(one + exp(temp_max))); + } +}; + +template +struct CudaSoftReluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 + // Inputs: args[0], the input dout + // args[1], the input out + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType out = static_cast(args[1]); + MPType t = static_cast(threshold); + return (out > -t && out < t) ? 
static_cast(dout * (one - exp(-out))) + : static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // stanh(x) = b * tanh(a * x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + return static_cast(b * tanh(a * x)); + } +}; + +template +struct CudaSTanhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + MPType temp = tanh(a * x); + return static_cast(dout * a * b * (one - temp * temp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftplusFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // softplus(x) = beta * x > threshold ? x : log(1 + exp(beta * x)) / beta + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return static_cast(x_beta > t ? x : log(one + exp(x_beta)) / b); + } +}; + +template +struct CudaSoftplusGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return x_beta > t ? 
args[0] : static_cast(dout / (one + exp(-x_beta))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftsignFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // softsign(x) = x / (1 + abs(x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + abs(args[0])); + } +}; + +template +struct CudaSoftsignGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + abs(x))^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = one + abs(args[1]); + return args[0] / (temp * temp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaRelu6Functor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // relu6(x) = min(max(0, x), 6) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return args[0] <= zero ? zero : (args[0] < t ? args[0] : t); + } +}; + +template +struct CudaRelu6GradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > 0 && out < t) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return (args[1] > zero && args[1] < t) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 
0 : dout + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = args[0] * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return (out > zero && out < one) ? args[0] * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSwishFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // swish(x) = x / (1 + exp(-beta * x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + return static_cast(x / (one + exp(-b * x))); + } +}; + +template +struct CudaSwishGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType temp1 = one / (one + exp(-b * x)); + MPType out = x * temp1; + MPType temp2 = b * out; + MPType temp3 = temp1 * (one - temp2); + return static_cast(dout * (temp2 + temp3)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > static_cast(threshold) ? 
args[0] : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > static_cast(threshold) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSwishFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // hard_swish(x) = 0, when x <= -offset + // x , when x >= threshold - offset + // x * (x + offset) / scale, otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + T temp = x + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < t ? temp_max : t; + return temp_min * x / static_cast(scale); + } +}; + +template +struct CudaHardSwishGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + T two = static_cast(2.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // dx = 0, when x <= -offset + // dout , when x >= threshold - offset + // dout * (2 * x / scale + offset / scale), otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T o = static_cast(offset); + T s = static_cast(scale); + T temp1 = static_cast(x + o > zero); + T temp2 = static_cast(x + o < static_cast(threshold)); + return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename details::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + CT x = static_cast(args[0]); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = (x > zero ? x : zero) + (temp > zero ? 
zero : temp); + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0 + // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0 + // dx = 0, if alpha <= 0 and x <=0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * a * exp(x) + + temp_a_neg * temp_x_pos * (one + a * exp(x)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +class ActivationCudaKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor* x = nullptr; + framework::Tensor* out = nullptr; + ExtractActivationTensor(ctx, &x, &out); + out->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); + } +}; + +template +class ActivationGradCudaKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *x, *out, *d_out; + framework::Tensor* d_x = nullptr; + x = out = d_out = nullptr; + ExtractActivationGradTensor(ctx, &x, &out, &d_out, + &d_x); + d_x->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + + std::vector ins = {d_out}; + std::vector outs = {d_x}; + + if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + // Only need forward output Out + ins.push_back(out); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(kDepX)) { + // Only need forward input X + ins.push_back(x); + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } else { + LaunchElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); - -FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); +#define 
REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); + +#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -48,7 +1425,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ======================== elu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, CudaELUFunctor, CudaELUGradFunctor); REGISTER_OP_CUDA_KERNEL( elu_grad_grad, ops::ELUDoubleGradKernel>); /* ========================================================================== */ +/* =========================== tanh register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, + CudaTanhGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + tanh_grad_grad, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( sqrt_grad_grad, @@ -87,7 +1480,8 @@ REGISTER_OP_CUDA_KERNEL( /* =========================== rsqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor, + CudaRsqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( rsqrt_grad_grad, @@ -100,25 +1494,8 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== square register ============================ */ -REGISTER_OP_CUDA_KERNEL( - square, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CUDA_KERNEL( - square_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); +REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -135,7 +1512,6 @@ REGISTER_OP_CUDA_KERNEL( /* 
========================================================================== */ /* ========================== pow register ============================ */ - REGISTER_OP_CUDA_KERNEL( pow, ops::PowKernel>, ops::PowKernel>, @@ -153,29 +1529,30 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ========================== exp register ============================ */ - REGISTER_OP_CUDA_KERNEL( - exp, ops::ActivationKernel>, - ops::ActivationKernel>, + exp, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, ops::ActivationKernel>, ops::ActivationKernel>, - ops::ActivationKernel>); + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - exp_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + exp_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); /* ========================================================================== */ /* ========================== Log register ==================================*/ -REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); REGISTER_OP_CUDA_KERNEL( log_grad_grad, ops::LogDoubleGradKernel>); /* ========================================================================== */ + +#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ + __macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); \ + __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ + __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ + CudaLogSigmoidGradFunctor); \ + __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ + __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ + CudaSoftShrinkGradFunctor); \ + __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ + __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ + __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ + __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ + __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ + __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ + __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ + __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ + __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ + __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ + CudaReciprocalGradFunctor); \ + __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ + __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ + __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ + __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ + __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ + __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ + __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ + __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); \ + __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \ + __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor, \ + CudaTanhShrinkGradFunctor); \ + __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ + CudaHardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ + 
CudaHardSigmoidGradFunctor); \ + __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ + CudaThresholdedReluGradFunctor); \ + __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ + CudaHardSwishGradFunctor); +FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index bc7def61b2e249d3fa5e8b1d915b86f50beaf77b..ccd5bf528ba58ca731513a1a1fafce3f2f64c470 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -366,6 +391,36 @@ struct TanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -400,7 +455,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2 > 0).template cast(); + out.device(d) = x * (temp1 || temp2).template cast(); } }; @@ -417,7 +472,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void 
operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2 > 0).template cast(); + dx.device(d) = dout * (temp1 || temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1734,6 +1789,58 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template +class TanhDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + auto ddx_var = ctx.InputVar("DDX"); + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddx_var, platform::errors::NotFound( + "Cannot get input Variable ddx, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Cannot get input Variable out, variable name = %s", + ctx.InputName("Out"))); + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + + // set output ddout + auto ddout_var = ctx.OutputVar("DDOut"); + if (ddout_var) { + ddOut = ctx.Output("DDOut"); + } + + // extract dOut(intput) + auto dout_var = ctx.InputVar("DOut"); + PADDLE_ENFORCE_NOT_NULL( + dout_var, platform::errors::NotFound( + "Cannot get input Variable dout_var, variable name = %s", + ctx.InputName("DOut"))); + dOut = ctx.Input("DOut"); + + // set output dout_new + auto dout_new_var = ctx.OutputVar("DOutNew"); + if (dout_new_var) { + dOutNew = ctx.Output("DOutNew"); + } + + if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2047,8 +2154,8 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f368c658230555c5a3529b39dfc1b60b1cab56e4 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,367 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, factor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
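As a side note (illustration only, not part of the patch): the op sequence in this kernel (Power, then the FillD broadcast prepared below, then two Mul runs) is just the ordinary power rule, dx = dout * factor * x^(factor - 1); the broadcast step exists only because the NPU "Mul" op multiplies two operands of the same shape, while factor arrives as a scalar attribute. A minimal CPU sketch of the same formula, with the helper name PowGradReference invented for this note:

#include <cmath>
#include <cstddef>
#include <vector>

// CPU reference for the power-rule gradient assembled by the NPU ops in this
// kernel: dx[i] = dout[i] * factor * x[i]^(factor - 1).
std::vector<float> PowGradReference(const std::vector<float>& x,
                                    const std::vector<float>& dout,
                                    float factor) {
  std::vector<float> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    dx[i] = dout[i] * factor * std::pow(x[i], factor - 1.0f);
  }
  return dx;
}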
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = 
ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h index 97e3ed9c1adda0082a7ce74cfb5d9b6cb78dde63..ecfd10d2fa6fbdbfa37bfd4f3597b8fbf0a0c7c7 100644 --- a/paddle/fluid/operators/addmm_op.h +++ b/paddle/fluid/operators/addmm_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -32,8 +33,8 @@ template using EigenTensor = framework::EigenTensor; -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; using Tensor = framework::Tensor; @@ -105,7 +106,8 @@ class AddMMKernel : public framework::OpKernel { auto eigen_out = EigenTensor::From(*out); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_input.broadcast(bcast_dims); + EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, x->data(), x_dims[1], y->data(), y_dims[1], beta, diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119ded7a305c97f3365d1a72d50acf..2ea8bbcbc61df84eb445b1a512653d66f600c46a 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..181dd6eabe22d7d0c82b7c8f17625d787008f00b --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe5b08af52a624b29100635ee34cfac7c2d2a859 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f2618eb766d84de2c631fc0cf8c5..c7520dbd34f6a92afb5c2fe320197fdad8e95379 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4c6485c2815e0634bcd7aaa16783b4..c699486a9140a388dc79359cf3cc40fc61e4f45b 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,51 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; + + // get in data and out data + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * local_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) 
{ - *found_inf = true; + local_found_inf = true; } } + if (local_found_inf) { + *found_inf = true; + } } template @@ -63,20 +96,57 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); + + // Launch Kernel + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale< + T, MPDType><<>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fd45326e4ec6134cf4b98be12212ce8d7d74541 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, +// and clear it after this op. +// Which may leads to wrong result if the input tensors is not calculated +// on NPU device, but got from other way, for example, feeding. +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step1: inverse scale(RealDiv) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder. 
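For orientation (illustration only, not part of the patch): per input tensor, the operator's intended behaviour is simply "multiply by 1/scale and raise a flag if any element is NaN/Inf". On the NPU the per-element finiteness test is replaced by reading the device float-status register through the NPUGetFloatStatus / ReduceSumD calls below, and the kernel skips the multiply entirely once an overflow has been detected. A minimal CPU sketch of those semantics, with the name CheckFiniteAndUnscaleReference invented for this note:

#include <cmath>
#include <cstddef>
#include <vector>

// out[i] = x[i] * (1 / scale); returns true if any element of x is NaN/Inf.
// On the NPU the isfinite check is done via the hardware float status, as the
// NOTE above explains, rather than by inspecting each element.
bool CheckFiniteAndUnscaleReference(const std::vector<float>& x, float scale,
                                    std::vector<float>* out) {
  bool found_inf = false;
  const float inverse_scale = 1.0f / scale;
  out->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = x[i] * inverse_scale;
    if (!std::isfinite(x[i])) found_inf = true;
  }
  return found_inf;
}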
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + if (!found_inf_data) { + // MatMul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } + } + + // set found_inf to true + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a80b83f0cbe51fe536955b047d7be1b4c451a5a9 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *found_inf_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_TRUE(*found_inf_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index b48b0e78892933bc76894611d0ae6d01c194d036..de1f83c1ee50d00960c50638fab5fd6cffca1a36 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -34,13 +34,39 @@ __global__ void GpuUpdateLossScaling( } template -__global__ void FillIf(T* data, const int64_t num, const T value, - const bool* has_inf) { - if 
(*has_inf) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < num; i += blockDim.x * gridDim.x) { - data[i] = value; - } +__global__ void FusedFillIf(T** outs, const size_t xs_size, + const int64_t* starts, const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - s_starts[out_index]; + + // set value + out_data[idx] = value; } } @@ -68,15 +94,52 @@ class LazyZeros { const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int64_t num = out->numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - FillIf<<>>( - out_data, num, static_cast(0), found_inf_data); + size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. 
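As an aside (illustration only, not part of the patch), the flattened-offset scheme used here is the same one used by the fused CheckFiniteAndUnscale kernel earlier in this change: a prefix-sum "starts" array maps a global element id back to the tensor that owns it. A standalone sketch, with the helper names BuildStarts and FindTensorIndex invented for this note; the original comment's worked example continues right after this sketch.

#include <cstddef>
#include <cstdint>
#include <vector>

// starts[i] is the global offset of tensor i; starts.back() is the total
// element count. E.g. numels {10, 0, 10, 10} -> starts {0, 10, 10, 20, 30}.
std::vector<int64_t> BuildStarts(const std::vector<int64_t>& numels) {
  std::vector<int64_t> starts(numels.size() + 1, 0);
  for (std::size_t i = 0; i < numels.size(); ++i) {
    starts[i + 1] = starts[i] + numels[i];
  }
  return starts;
}

// Returns idx such that starts[idx] <= id < starts[idx + 1], using the same
// forward linear scan as the device kernels (valid for 0 <= id < total).
int FindTensorIndex(const std::vector<int64_t>& starts, int64_t id) {
  int next = 0;
  while (id >= starts[next]) ++next;
  return next - 1;
}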
For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..820966addfcff31d1676aedd71101a2e3c5a4332 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,247 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = 
good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + if (!xs.size()) { + return; + } + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + Tensor* zero_tensor; + void* zero_ptr; + if (found_inf_vec[0]) { + int max_num = -1; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + int num = out->numel(); + if (max_num < num) { + max_num = num; + zero_tensor = out; + } + } + + zero_tensor->mutable_data(place); + auto runner_zeros = + NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); + runner_zeros.Run(stream); + zero_tensor->check_memory_size(); + zero_ptr = zero_tensor->data(); + } + + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + auto* x = xs[i]; + auto dst_ptr = out->mutable_data(place); + if (!found_inf_vec[0]) { + framework::TensorCopy(*x, place, dev_ctx, out); + } else if (zero_ptr != dst_ptr) { + auto size = out->numel() * framework::SizeOfType(out->type()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size, + stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = 
ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index add533bafcb0a7f20c76f0844fb609d7af719bb1..433cabcfee0104a1112baa4aca6c18d072d8f696 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -162,6 +162,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, ops::AssignKernel, int, ops::AssignKernel, int64_t, ops::AssignKernel, bool, ops::AssignKernel, plat::float16, + ops::AssignKernel, plat::bfloat16, ops::AssignKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..93689d5e495f33484d2f05b04d25734a8c5ab07e --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..792d01a5efe43034c201a57641cf3dc1b4c38e4c --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
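// A plain C++ reference (illustration only; the names below are not from the patch)
// of the dynamic loss-scaling policy that the Update() routine in
// update_loss_scaling_op_npu.cc above expresses with Add/Power/Memset NPU ops:
// grow the scale after incr_every_n_steps consecutive finite steps, shrink it after
// decr_every_n_nan_or_inf consecutive inf/nan steps, clamp it at 1, and never let it
// overflow.
#include <cmath>

struct LossScaleState {
  float scale;
  int good_steps;
  int bad_steps;
};

void UpdateLossScaleRef(LossScaleState* s, bool found_inf, int incr_every_n_steps,
                        int decr_every_n_nan_or_inf, float incr_ratio,
                        float decr_ratio) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      float next = s->scale * decr_ratio;
      s->scale = next < 1.0f ? 1.0f : next;  // clamp at 1, as runner_p4 does above
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float next = s->scale * incr_ratio;
      if (std::isfinite(next)) s->scale = next;  // keep the old scale on overflow
      s->good_steps = 0;
    }
  }
}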
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "assign"); +} diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index fc31885824b55f22bba77559d728a1e40d47e784..edad20435b41c9eb59c3df793c00ab3bfe96771b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -575,7 +575,7 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = @@ -585,6 +585,8 @@ class BatchNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + use_global_stats = is_test || use_global_stats; + // batch_norm with inplace as false will take X as grad input, which // is same as cuDNN batch_norm backward calculation, batch_norm // with inplace as true only take Y as input and X should be calculate @@ -605,13 +607,6 @@ class BatchNormGradKernel "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b89175136e650aab15ad7b8b2881..6fc78732b1063af04a34de5d690a4f2ed75978f2 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + 
const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -592,7 +817,7 @@ class BatchNormGradKernel platform::errors::InvalidArgument("It must use CUDAPlace.")); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -625,12 +850,7 @@ class BatchNormGradKernel } const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); + use_global_stats = is_test || use_global_stats; const auto &x_dims = x->dims(); @@ -668,8 +888,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +938,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +962,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +988,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1001,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1102,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1191,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 99153101fc326ca467ef47ac4733457fbd8335da..8bd2b7fe2d127c379e27e42f4c189c90fb7ebf8e 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -32,6 +32,11 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T one = static_cast(1.); T neg_100 = static_cast(-100.); + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(real_log(x), neg_100); T term2 = max(real_log(one - x), neg_100); @@ -64,29 +69,13 @@ class BCELossCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* out = ctx.Output("Out"); - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); auto x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); - - Tensor x_cpu; - framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); - T* x_cpu_data = x_cpu.data(); - - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_cpu_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_cpu_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - } - auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); GPUBCELossForward<<>>(x_data, labels->data(), @@ -102,9 +91,10 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto dx_data = dx->mutable_data(ctx.GetPlace()); int x_numel = x->numel(); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.cuda_device_context(); platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index c5cfa7a3bafce7b079cacaca4e57764439d4b282..40f4b969ec060d8453d176db67a6eb20933c6b3e 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -97,5 +97,6 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 8fa0416049f8fa128d7ab61f8350b41960f07263..cd60c7707cb0aaedd526480c088fabef88b5079f 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0de0f5e4505795f69f1d80e2bbc1600250fc7391 --- /dev/null +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
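// The PADDLE_WITH_HIP branches in the batch_norm_op.cu hunks above replace the MIOpen
// calls with hand-written kernels (BNForwardInference/BNForwardTraining/BNBackward).
// Below is a minimal CPU reference (assumptions: NCHW layout, float data; not part of
// the patch) of the per-channel statistics that BNForwardTraining parallelizes with
// one block per channel and a cub::BlockReduce.
#include <cmath>
#include <vector>

void BatchNormForwardRef(const std::vector<float>& x, int N, int C, int HxW,
                         const std::vector<float>& scale,
                         const std::vector<float>& bias, double epsilon,
                         std::vector<float>* y, std::vector<float>* mean,
                         std::vector<float>* inv_std) {
  for (int c = 0; c < C; ++c) {
    double sum = 0.0, sq_sum = 0.0;
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        double v = x[(n * C + c) * HxW + k];
        sum += v;
        sq_sum += v * v;
      }
    }
    double m = sum / (N * HxW);
    double var = sq_sum / (N * HxW) - m * m;  // biased (population) variance
    double istd = 1.0 / std::sqrt(var + epsilon);
    (*mean)[c] = static_cast<float>(m);
    (*inv_std)[c] = static_cast<float>(istd);
    for (int n = 0; n < N; ++n) {
      for (int k = 0; k < HxW; ++k) {
        int i = (n * C + c) * HxW + k;
        (*y)[i] = static_cast<float>(scale[c] * (x[i] - m) * istd + bias[c]);
      }
    }
  }
}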
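// For reference, the elementwise loss that GPUBCELossForward (bce_loss_op.cu hunk
// above) evaluates; the patch moves the [0, 1] input check into the kernel via
// PADDLE_ENFORCE instead of copying the whole tensor to the host. This is a sketch of
// the math only, not the kernel itself.
#include <algorithm>
#include <cassert>
#include <cmath>

inline float BCELossRef(float x, float label) {
  assert(x >= 0.0f && x <= 1.0f);  // enforced on-device in the CUDA kernel
  // out = -(label * log(x) + (1 - label) * log(1 - x)),
  // with each log clamped to -100 so x == 0 or x == 1 stays finite
  float term1 = std::max(std::log(x), -100.0f);
  float term2 = std::max(std::log(1.0f - x), -100.0f);
  return -(label * term1 + (1.0f - label) * term2);
}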
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +static std::map + DTYPE_2_ACL_DTYPE = { + {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::INT16, ACL_INT16}, + {framework::proto::VarType::INT32, ACL_INT32}, + {framework::proto::VarType::INT64, ACL_INT64}, + {framework::proto::VarType::FP16, ACL_FLOAT16}, + {framework::proto::VarType::FP32, ACL_FLOAT}, + {framework::proto::VarType::FP64, ACL_DOUBLE}, +}; + +using Tensor = framework::Tensor; + +template +class CastNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + int dtype = ctx.Attr("out_dtype"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + + if (x->type() == dtype) { + // NOTE(zhiqiu): NPU cast op may result in wrong value, so + // add special case here. + VLOG(4) << "cast to same dtype:" << dtype; + out->mutable_data(place, x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + return; + } + + auto iter = DTYPE_2_ACL_DTYPE.find( + static_cast(dtype)); + int aclDtype = iter->second; + + if (dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Cast", {*x}, {*out}, + {{"dst_type", static_cast(aclDtype)}}); + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + cast, ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel, + ops::CastNPUKernel); diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index bbd43274a002d85e38d05aa00dcc79f1e11308f7..ca15858cf67d756fc8eb41f4e26a2e0b923abef6 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -23,8 +23,22 @@ limitations under the License. 
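// The CastNPUKernel above fetches the ACL dtype with DTYPE_2_ACL_DTYPE.find() and
// dereferences the iterator directly. A defensive variant of that lookup could look
// like the sketch below; the helper name and error message are illustrative
// assumptions, not part of the patch.
static aclDataType ToAclDataType(framework::proto::VarType::Type dtype) {
  auto iter = DTYPE_2_ACL_DTYPE.find(dtype);
  if (iter == DTYPE_2_ACL_DTYPE.end()) {
    // report the unsupported target dtype instead of reading an invalid iterator
    PADDLE_THROW(platform::errors::Unimplemented(
        "The NPU cast kernel does not support casting to dtype %d.",
        static_cast<int>(dtype)));
  }
  return iter->second;
}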
*/ namespace paddle { namespace operators { +template +class XPUFPTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUFPTypeTrait { + public: + using Type = float16; +}; + template class CastXPUKernel : public framework::OpKernel { + using XPUInTDType = typename XPUFPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); @@ -34,27 +48,39 @@ class CastXPUKernel : public framework::OpKernel { auto out_type = static_cast( context.Attr("out_dtype")); auto* in_data = in->data(); + + // using XPUOutTDType = typename XPUFPTypeTrait::Type; auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; if (out_type == framework::proto::VarType::FP32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT64) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if ((out_type == framework::proto::VarType::BOOL) && (in_type == framework::proto::VarType::FP32)) { auto* out_data = out->mutable_data(context.GetPlace()); r = xpu::cast_v2( dev_ctx.x_context(), (const float*)in_data, reinterpret_cast(out_data), numel); + } else if (out_type == framework::proto::VarType::FP16) { + auto* out_data = + out->mutable_data(context.GetPlace()); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast(out_data), numel); + } else { PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", in_type, out_type)); @@ -75,5 +101,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( cast, ops::CastXPUKernel, ops::CastXPUKernel, + ops::CastXPUKernel, ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757cd01636d176655011739eae1d56..7176a0466bb831cdbbaf66dfbb2d2625bdbf66cf 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf64560d6d38c994b28515813a664..fd61e4ea61d4ff20656dea842b02958c8c2701b9 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ limitations under the License. 
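// The cast_op_xpu.cc hunk above adds an XPUFPTypeTrait so that paddle's fp16 tensors
// are handed to the XPU runtime as the runtime's own fp16 type. The trait pattern in
// isolation (the two fp16 structs below are stand-ins, not the real types):
#include <cstdint>

struct PaddleFloat16 { uint16_t bits; };  // placeholder for platform::float16
struct XpuFloat16 { uint16_t bits; };     // placeholder for the device-side float16

template <typename T>
struct DeviceTypeTraitSketch {
  using Type = T;  // most dtypes map to themselves
};

template <>
struct DeviceTypeTraitSketch<PaddleFloat16> {
  using Type = XpuFloat16;  // fp16 is reinterpreted as the device's fp16
};

// usage inside a kernel body:
//   using DevT = typename DeviceTypeTraitSketch<T>::Type;
//   const DevT* dev_in = reinterpret_cast<const DevT*>(in_data);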
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9dcc5c52d27804262bd9c5169444ea..3f210219608fb7efa740ce2d4a52c736acdfdcc9 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -29,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} 
ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd4f53400418c93f8598b7a9c38f4cc..63b135a74cf4b7b80b8baec462aa920fce370f7e 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f78399026b9f853b8315f0acf6dbad64242a..fe2e49105527065a8ccfd9e0b00cb88a1f879304 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c57b9f99676337c88d6a51927195eeedb8b0a2a --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +namespace paddle { +namespace operators { + +class AllToAllOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class AllToAllOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor send."); + AddOutput("Out", "(Tensor) the result of alltoall."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +AllToAll Operator +Scatter tensors from all participators to all participators. +)DOC"); + } +}; + +template +class AllToAllOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("alltoall"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(alltoall, ops::AllToAllOp, ops::AllToAllOpMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllInplaceInferer) + +REGISTER_OP_CPU_KERNEL(alltoall, ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1bcb47fc686cfe4b93420697b15d0c2585f0358e --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int send_numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + framework::DDim x_dims = x->dims(); + framework::DDim out_dims(x_dims); + PADDLE_ENFORCE_EQ( + x_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The first dimension size (%d) of the input tensor must be " + "divisible by the number of ranks (%d).", + x_dims[0], nranks)); + auto send_buf = x->data(); + auto recv_buf = out->mutable_data(out_dims, place); + size_t offset = 0; + send_numel /= nranks; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < nranks; ++i) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + offset += send_numel; + } + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.h b/paddle/fluid/operators/collective/alltoall_op.h similarity index 51% rename from paddle/fluid/operators/distributed_ops/split_byref_op.h rename to paddle/fluid/operators/collective/alltoall_op.h index fedd7218dd6cc9481e94a92a3820cafbe4157bd0..61eec44093794ccaf820d257d7c2c6b363e10391 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.h +++ b/paddle/fluid/operators/collective/alltoall_op.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
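// A host-side reference (sketch only, independent of the Paddle/NCCL API) of what the
// grouped ncclSend/ncclRecv loop in AllToAllOpCUDAKernel above exchanges: every rank
// splits its buffer into nranks equal chunks, and chunk dst of rank src becomes chunk
// src of rank dst.
#include <vector>

std::vector<std::vector<float>> AllToAllRef(
    const std::vector<std::vector<float>>& all_data, int nranks, int chunk) {
  // all_data[r] is rank r's input of length nranks * chunk
  std::vector<std::vector<float>> out(nranks,
                                      std::vector<float>(nranks * chunk));
  for (int src = 0; src < nranks; ++src) {
    for (int dst = 0; dst < nranks; ++dst) {
      for (int k = 0; k < chunk; ++k) {
        out[dst][src * chunk + k] = all_data[src][dst * chunk + k];
      }
    }
  }
  return out;
}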
You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -14,28 +14,27 @@ limitations under the License. */ #pragma once +#include +#include #include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + namespace paddle { namespace operators { -template -class SplitByrefOpKernel : public framework::OpKernel { +template +class AllToAllOpCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - auto place = ctx.GetPlace(); - - size_t row_offset = 0; - for (size_t i = 0; i < outs.size(); ++i) { - // NOTE: no need to call mutable_data here to allocate memory. - auto* out = outs[i]; - VLOG(3) << "spliting by ref: " << row_offset << " " << out->dims()[0]; - *out = in->Slice(row_offset, row_offset + out->dims()[0]); - row_offset += out->dims()[0]; - } + PADDLE_THROW(platform::errors::Unavailable( + "Do not support alltoall for cpu kernel now.")); } }; diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8cecec02b5a08fbc3337ffa665a1..c4e779698cccafc6d958e823f087a1276b6246c3 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7f05549d9efea8579e103826aca6664c8cd9f9b --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CAllGatherOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + framework::DDim out_dims = in->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint64_t send_numel = in->numel(); + void *send_buff = reinterpret_cast(const_cast(in->data())); + void *recv_buff = reinterpret_cast(out->data()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + VLOG(3) << "begin hccl allgather, parameter is: " + << ", group is " << group << ", ring_id is " << ring_id + << ", nranks is " << nranks; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, comm->comm(), + reinterpret_cast(stream))); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c7dfc4aad7d0ec11486d12551d0670d38626579 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 835b49e57bc0922a0d0be7895b57275ba31d2173..8bdbdfac8ffd1d8294aca28e90a9b6471c0fc2a9 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -37,14 +37,19 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Max"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle 
namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker); +REGISTER_OPERATOR( + c_allreduce_max, ops::CAllReduceOp, ops::CAllReduceMaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMaxInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_max, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..4dece4a3721ff5d557e1008bd70a8a8d4a4b1c58 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7fd2739d51181cd9bb774a31ecfb108a909f4c1 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc similarity index 55% rename from paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc rename to paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc index 9b70f78399026b9f853b8315f0acf6dbad64242a..b0aa51f7cfdfdcd51db6b31a76a6c5c8b77b3d62 100644 --- a/paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,14 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - allreduce, ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel, - ops::AllReduceOpKernel); +REGISTER_OP_XPU_KERNEL(c_allreduce_max, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index efc19659c83ec35c9650d3184654f97d23940745..9d913b12b13767a1375b5b93bbcc483bdbd51a22 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -37,14 +37,19 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Min"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp, - ops::CAllReduceMinOpMaker); +REGISTER_OPERATOR( + c_allreduce_min, ops::CAllReduceOp, ops::CAllReduceMinOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMinInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_min, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..48e1d2eeb58c5227d57c95d9cb91028cc3266e55 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_min, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f16a89c217dacb1529ab2f57d300aceabb95a85 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2f56f43d793fa941e96e5711ac48eb2899290259..3a74f551e7a30ed64104f8054a4e063fa816944e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,17 +19,31 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -105,6 +119,136 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + out->mutable_data(in->dims(), ctx.GetPlace()); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CAllReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx 
= platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: @@ -170,10 +314,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 5ab07ef026bac5bef7386b0789803933cd8fdf2a..3ad078e1c8ff0f3438eb1c74ddc82c537a73bf5f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -37,14 +37,19 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Prod"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp, - ops::CAllReduceProdOpMaker); +REGISTER_OPERATOR( + c_allreduce_prod, ops::CAllReduceOp, ops::CAllReduceProdOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceProdInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_prod, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f3d14afe0a1bc7228b4cb8517e09633e916c46ac --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
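// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The new ASCEND and XPU kernels in
// c_allreduce_op.h above only map the compile-time red_type
// (kRedSum/kRedMax/kRedMin/kRedProd) onto the matching HCCL or BKCL reduction
// and issue a single collective call. The host-side reference below shows what
// each reduction produces element-wise (SimulateAllReduce is a hypothetical
// helper, not a Paddle API):
// ---------------------------------------------------------------------------
#include <algorithm>
#include <vector>

enum RefRedType { kRefSum, kRefMax, kRefMin, kRefProd };

std::vector<float> SimulateAllReduce(
    const std::vector<std::vector<float>>& inputs, RefRedType red_type) {
  std::vector<float> out = inputs.at(0);
  for (size_t r = 1; r < inputs.size(); ++r) {
    for (size_t i = 0; i < out.size(); ++i) {
      switch (red_type) {
        case kRefSum:  out[i] += inputs[r][i]; break;
        case kRefMax:  out[i] = std::max(out[i], inputs[r][i]); break;
        case kRefMin:  out[i] = std::min(out[i], inputs[r][i]); break;
        case kRefProd: out[i] *= inputs[r][i]; break;
      }
    }
  }
  return out;  // every rank receives this same reduced tensor
}

int main() {
  // Mirrors the two-rank c_allreduce_max NPU test above, where ranks hold
  // 1.0 + 3 * rank, i.e. 1.0 and 4.0; the element-wise max asserted there is 4.0.
  auto out = SimulateAllReduce({{1.0f}, {4.0f}}, kRefMax);
  return out.at(0) == 4.0f ? 0 : 1;
}
// ---------------------------------------------------------------------------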
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_prod, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..92ba00428065bc318a48e7e9e63910716015cbf7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_prod, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 68061e6ae6bea097b7a2bc5ee19d58c05fd21848..18c317506c06e1fa099f872db46990a2155e3e40 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("c_allreduce_sum"); + bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel")); + if (use_mp) { + retv->SetType("c_identity"); + } else { + retv->SetType("c_allreduce_sum"); + } retv->SetInput("X", this->OutputGrad("Out")); retv->SetOutput("Out", this->InputGrad("X")); retv->SetAttrMap(this->Attrs()); @@ -49,6 +54,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Sum"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle @@ -58,7 +65,7 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp, ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpGradMaker, - ops::CAllReduceSumOpMaker); + ops::CAllReduceSumOpMaker, ops::AllreduceSumInplaceInferer); REGISTER_OP_CPU_KERNEL(c_allreduce_sum, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b66e2e1968908cacfca80dc6b6a1939fb0392c16 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1bf9683e35593720e9db604142312a055356bb0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4ec538cd2323009657ea85e0a5d59db0ea0d3c8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
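// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] CAllReduceSumOpGradMaker above
// now inspects the new use_model_parallel attribute: in plain data-parallel
// training the backward of c_allreduce_sum is another c_allreduce_sum, but in
// the model-parallel pairing the forward allreduce already yields the full
// sum, so the backward only needs an identity copy (c_identity) and must not
// reduce the gradient a second time. A sketch of that decision rule, using a
// hypothetical free function:
// ---------------------------------------------------------------------------
#include <iostream>
#include <string>

std::string BackwardOpForAllReduceSum(bool use_model_parallel) {
  return use_model_parallel ? "c_identity" : "c_allreduce_sum";
}

int main() {
  std::cout << BackwardOpForAllReduceSum(false) << "\n"   // data parallel
            << BackwardOpForAllReduceSum(true) << "\n";   // model parallel
  return 0;
}
// ---------------------------------------------------------------------------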
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc index 928fa8549ffb9209dea975a049db4beed0add6b6..271d543eb2364d4a088291ce6838be3f4d455ff0 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("root", "(int default 0) root id for broadcasting.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a60ba86572822c4390f23dd85ada1db35145ffdb --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e39613f3fbe3a9277402765c4f4c7a140b9be23 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7817f19bacb1879517d4865165836f46e4b68e75 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
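// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] c_broadcast replicates the root
// rank's tensor to every rank in the ring: the ASCEND kernel above calls
// HcclBroadcast in place and then copies X into Out when they are distinct
// tensors. In TestHCCLBroadcastOp above, rank r fills a 2x2 tensor with
// 1.0 + r and root is 0, so both ranks must read back all 1.0, which is what
// the EXPECT_EQ loop asserts. Host-side model (SimulateBroadcast is a
// hypothetical helper, not a Paddle API):
// ---------------------------------------------------------------------------
#include <vector>

// Every rank's output equals the root rank's input.
std::vector<float> SimulateBroadcast(
    const std::vector<std::vector<float>>& inputs, int root) {
  return inputs.at(root);
}
// ---------------------------------------------------------------------------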
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+class CCommInitOpAscend : public framework::OperatorBase {
+ public:
+  CCommInitOpAscend(const std::string& type,
+                    const framework::VariableNameMap& inputs,
+                    const framework::VariableNameMap& outputs,
+                    const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    PADDLE_ENFORCE_EQ(is_npu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "CCommInitOpAscend can run on npu place only."));
+
+    auto var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::InvalidArgument("Input can not be empty."));
+#if defined(PADDLE_WITH_ASCEND_CL)
+    HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
+
+    int rank_ids = Attr<int>("rank_ids");
+    int rank_id = Attr<int>("rank");
+    int rid = Attr<int>("ring_id");
+    int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
+    if (Attr<int>("device_id") >= 0) {
+      device_id = Attr<int>("device_id");
+    }
+    platform::HCCLCommContext::Instance().CreateHCCLComm(
+        hccl_id, rank_ids, rank_id, device_id, rid);
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Raw variable contains an HCCL UniqueId instance.");
+    AddComment(R"DOC(
+CCommInit operator
+
+Initialize collective communication context within this trainer
+)DOC");
+    AddAttr<int>("rank_ids",
+                 "(int) The number of ranks of distributed trainers");
+    AddAttr<int>("rank",
+                 "(int) The rank of the trainer in distributed training.");
+    AddAttr<int>("device_id",
+                 "(int) The device_id on which to initialize the communicator."
+                 "Now, you only have to set this attr manually for pipeline "
+                 "training. Otherwise, leave it as the default.")
+        .SetDefault(-1);
+    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend,
+                  ops::CCommInitOpAscendMaker);
diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..551fde21162582fbbb2b356a2aa265247a4af94d
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_concat_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
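// ---------------------------------------------------------------------------
// [Editorial sketch, not part of this patch] The c_concat op added below is
// used as the counterpart of c_split in model-parallel layers: each rank holds
// a shard whose last dimension is 1/nranks of the full width, and every rank
// gets the full tensor back, concatenated along the last dimension (its
// InferShape multiplies the last dim by nranks, and its grad maker registers
// c_split). The CUDA kernel realizes this as an ncclAllGather along dim 0
// followed by a re-arrangement onto the last axis. Host-side sketch of the
// resulting layout (SimulateCConcat is a hypothetical helper, not a Paddle
// API):
// ---------------------------------------------------------------------------
#include <cstddef>
#include <vector>

// shards[r] is a rows x cols_per_rank matrix in row-major order; the result is
// rows x (cols_per_rank * nranks) with rank r's columns in block r.
std::vector<float> SimulateCConcat(
    const std::vector<std::vector<float>>& shards, size_t rows,
    size_t cols_per_rank) {
  const size_t nranks = shards.size();
  std::vector<float> out(rows * cols_per_rank * nranks);
  for (size_t r = 0; r < nranks; ++r) {
    for (size_t i = 0; i < rows; ++i) {
      for (size_t j = 0; j < cols_per_rank; ++j) {
        out[i * cols_per_rank * nranks + r * cols_per_rank + j] =
            shards[r][i * cols_per_rank + j];
      }
    }
  }
  return out;
}
// ---------------------------------------------------------------------------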
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_concat_op.h" + +namespace paddle { +namespace operators { + +class CConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_concat must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; + if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CConcatOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_split"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be concated."); + AddOutput("Out", "(Tensor) the result of concat."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CConcat Operator +AllGather the tensors on different trainers and concat them along the last dimension. 
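+For example, assuming two trainers (nranks = 2), a per-rank input of shape [4, 8] produces the same [4, 16] output on every trainer, i.e. the gathered inputs joined on the final dimension.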
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..bfdc49c440aae76a2aa9cebae82a419b471f4662 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + 
out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.h b/paddle/fluid/operators/collective/c_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..55a5799e37b6f5728793d0a03cb7e7a01558f2cb --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CConcatOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..593eaf923a978402cc7607bb7d2bc4a6419dd2cb --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+#include
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/var_type_traits.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+class CGenHCCLIdOp : public framework::OperatorBase {
+ public:
+  CGenHCCLIdOp(const std::string& type,
+               const framework::VariableNameMap& inputs,
+               const framework::VariableNameMap& outputs,
+               const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    int rank = Attr("rank");
+    framework::Scope& local_scope = scope.NewScope();
+
+    std::function func = [&](size_t i) -> std::string {
+      return Output("Out");
+    };
+
+    if (rank == 0) {
+      std::vector endpoint_list =
+          Attr>("other_endpoints");
+      SendBroadCastHCCLID(endpoint_list, 1, func, local_scope);
+    } else {
+      std::string endpoint = Attr("endpoint");
+      RecvBroadCastHCCLID(endpoint, 1, func, local_scope);
+    }
+    scope.DeleteScope(&local_scope);
+  }
+};
+
+#else
+
+class CGenHCCLIdOp : public framework::OperatorBase {
+ public:
+  CGenHCCLIdOp(const std::string& type,
+               const framework::VariableNameMap& inputs,
+               const framework::VariableNameMap& outputs,
+               const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {}
+};
+
+#endif
+
+class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    VLOG(3) << "ele";
+    AddOutput("Out", "Raw variable contains an HCCL UniqueId instance.");
+    AddComment(R"DOC(
+CGenHCCLId operator
+
+For trainer 0: generate a new UniqueId and send it to all the other trainers.
+For trainers 1~n: start a gRPC server to receive the UniqueId, and stop the server once it is received.
+)DOC");
+    AddAttr("endpoint",
+            "(string), e.g. 127.0.0.1:6175 "
+            "current listen endpoint");
+    AddAttr>(
+        "other_endpoints",
+        "['trainer1_ip:port', 'trainer2_ip:port', ...] "
+        "list of other trainer endpoints")
+        .SetDefault({});
+    AddAttr("rank",
+            "(int default 0) "
+            "The rank of the trainer in distributed training.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker);
diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
index 485a6d7ec4ed3575b6a35b74869d611a6678e2c5..470537582e97838322de2dabdd880b254d6401c9 100644
--- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -27,6 +27,7 @@ limitations under the License.
*/ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -65,6 +66,9 @@ class CGenNCCLIdOp : public framework::OperatorBase { return Output("Out"); }; + std::string endpoint = Attr("endpoint"); + int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); + std::vector nccl_ids; nccl_ids.resize(1); @@ -74,8 +78,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { Attr>("other_endpoints"); platform::SendBroadCastCommID(endpoint_list, &nccl_ids); } else { - std::string endpoint = Attr("endpoint"); - platform::RecvBroadCastCommID(endpoint, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids); } CopyNCCLIDToVar(nccl_ids, func, scope); @@ -83,6 +86,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..646c27b90e17ea316d31ae2199d8433b855efdd4 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..05bb3830b601fbb6cb9be38de258b56776fafad4 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c8577a9617489887167dbc7d9ae008608f1be48e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +template +class CIdentityOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_identity_op_npu.cc b/paddle/fluid/operators/collective/c_identity_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a822bd11a4a8332111d6c0813a377fa214a0c390 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op_npu.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f35b4c2f707226f38c37c350dcb6a76c160ff50f --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc similarity index 54% rename from paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc rename to paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc index 056659c3ea61f6233a6dda56ca1e272e72770d4a..6d3af7bb5f258b425a8618e412c3bb5552113bfe 100644 --- a/paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,8 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/distributed_ops/split_byref_op.h" +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - split_byref, - ops::SplitByrefOpKernel); +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_max, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ebb7e4c40e68e961059fdd41566492a5274bc31 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..791e58d8493cec454e2fe74c772ec70999cc8f36 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_min, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 1bce01e13a2ad25638128f4f619f458348d97b5e..fa9fd079d8e48b053deaa12fb3d61b0c00713b22 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -24,15 +24,28 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -110,6 +123,148 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + 
int root = ctx.Attr("root_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, root, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: @@ -179,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f0b7021e7997d9298bf350b387cd3521a4eb035d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7e770e8ffdcaf36408a13c7e01a31ebd1eced20 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_prod, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd4dbbd5f364575258ca23991bda68482cdca0f3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3683c7722ba3bfbfbd12dbf7b1a1688fa7446708 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0ec4d2a99cd711f315186f7ce8966585685aeef --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_sum, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270ccbd530a5a420248bb12d2707ffd..7836f11dc9b1fb9e6bb426e07450098ca3667535 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb7a88c697100ed4a49af00ee06e6..490b152bc2d302c29701a7cadfe91f2dc5bd8c45 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..44096a82c34d61bc75fc494145d7d52e63cd0aa3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f82f050a7206fe73f110df8f78989ad6181de84d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..03046d571d0f0542ff714868205d5a0aa285e685 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_split_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CSplitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split");
+    int nranks = ctx->Attrs().Get("nranks");
+    int rank = ctx->Attrs().Get("rank");
+    int ring_id = ctx->Attrs().Get("ring_id");
+    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
+                                     "The number of ranks (%d) for c_split "
+                                     "must be greater than 1.",
+                                     nranks));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_split must be non-negative.", ring_id));
+    PADDLE_ENFORCE_GE(
+        rank, 0, platform::errors::InvalidArgument(
+                     "The rank (%d) for c_split must be non-negative.", rank));
+    PADDLE_ENFORCE_LT(rank, nranks,
+                      platform::errors::InvalidArgument(
+                          "The value of rank (%d) for c_split must "
+                          "be less than that of nranks (%d).",
+                          rank, nranks));
+
+    framework::DDim dim = ctx->GetInputDim("X");
+    dim[dim.size() - 1] = dim[dim.size() - 1] / nranks;
+    if (dim[0] < 0) dim[0] = -1;
+    ctx->SetOutputDim("Out", dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+template
+class CSplitOpGradMaker : public framework::SingleGradOpMaker {
+ public:
+  using framework::SingleGradOpMaker::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr retv) const override {
+    retv->SetType("c_allgather");
+    retv->SetInput("X", this->OutputGrad("Out"));
+    retv->SetOutput("Out", this->InputGrad("X"));
+    retv->SetAttrMap(this->Attrs());
+  }
+};
+
+class CSplitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be split.");
+    AddOutput("Out", "(Tensor) the result of split.");
+    AddAttr("rank", "(int default 0) rank id.").SetDefault(0);
+    AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1);
+    AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0);
+    AddAttr(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddAttr("use_model_parallel",
+            "(bool default true) use this op with model parallel.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CSplit Operator
+Split the input tensor evenly along its last dimension; the trainer with rank i keeps the i-th slice.
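+For example, assuming two trainers (nranks = 2), a [4, 16] input is divided into two [4, 8] slices along the last dimension, and the trainer with rank 0 keeps the first slice while rank 1 keeps the second.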
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..92a7f5e41b1d2d8a1e3f4582ad014f630010c8ca --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ea0c7fc45c66b8bfeb51d88999f57c1e94eb2ab8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ 
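// Editor's note: a small reference sketch (assumed, not part of the patch) of
// the shape rule that CSplitOp::InferShape and the CUDA kernel above apply:
// the last dimension is divided evenly by nranks and rank r keeps the r-th
// slice along that axis; the backward pass is mapped onto c_allgather, which
// concatenates the slices back. The helper name below is illustrative only.
#include <cstdint>
#include <vector>

std::vector<int64_t> CSplitOutShape(std::vector<int64_t> dims, int nranks) {
  // mirrors InferShape: dim[last] /= nranks (a dynamic leading dim stays -1)
  dims.back() /= nranks;
  if (dims.front() < 0) dims.front() = -1;
  return dims;
}
// e.g. an input of shape [8, 30] with nranks = 2 yields an output of shape
// [8, 15]; rank 0 keeps columns [0, 15) and rank 1 keeps columns [15, 30).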
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index c4abe284d720963deecaabdb30f96f9b55e4753b..71ab25a7b0ff8a490d7de0022f810009a58482d4 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -15,40 +15,20 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorBase { +class CSyncCalcStreamOp : public framework::OperatorWithKernel { public: - CSyncCalcStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -65,10 +45,47 @@ Call calculation stream synchronization. 
} }; +template +class CSyncCalcStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); +#endif + +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, - ops::CSyncCalcStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..45613715b8260c3f38968e5cd91f245cd9f524d5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
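// Editor's note (design observation, no new behaviour): rewriting the op from
// OperatorBase to OperatorWithKernel lets each device register its own kernel
// via REGISTER_OP_CUDA_KERNEL / REGISTER_OP_NPU_KERNEL instead of branching on
// the place inside RunImpl. Because the op moves no data, InferShape is left
// empty and GetExpectedKernelType pins an arbitrary FP32 kernel type; the
// kernel itself is only a host-side barrier on the device's computation stream
// (cudaStreamSynchronize, hipStreamSynchronize or aclrtSynchronizeStream).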
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index adf27069f524e45c52ba30d9ee3e6920c7ea7751..71fda2cd01c8d6007cab19ebeea365467e8e7a99 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -14,45 +14,30 @@ limitations under the License. 
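// Editor's note: the c_sync_calc_stream NPU test above fills X with 1.0 and Y
// with 2.0, queues elementwise_add, and relies on a c_sync_calc_stream run
// (rather than ctx.Wait()) as the barrier before TensorToVector copies Out
// back to the host, so every element is expected to equal 3.0 once the
// computation stream has drained.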
*/ #include #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +#if defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" #endif namespace paddle { namespace operators { -class CSyncCommStreamOp : public framework::OperatorBase { +class CSyncCommStreamOp : public framework::OperatorWithKernel { public: - CSyncCommStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); + using framework::OperatorWithKernel::OperatorWithKernel; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -72,10 +57,46 @@ Call communication stream synchronization. 
} }; +template +class CSyncCommStreamKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto place = ctx.GetPlace(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif + +#elif defined(PADDLE_WITH_ASCEND_CL) + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, - ops::CSyncCommStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c5a6db61483dcd7e3578ded6a12a8a421ca1933 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
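// Editor's note: unlike c_sync_calc_stream, which waits on the device's
// computation stream, this kernel waits on the *communication* stream owned by
// the NCCL/RCCL/HCCL communicator selected by ring_id, conceptually:
//
//   auto stream =
//       platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
//   cudaStreamSynchronize(stream);  // or hipStreamSynchronize /
//                                   // aclrtSynchronizeStream on ROCm / NPU
//
// so the host can be sure a previously launched collective on that ring has
// finished before its output is consumed.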
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb2dd188725f8f582a20318fbe4845b39047d3e --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
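// Editor's note: the NPU tests in this patch all share the two-step
// communicator setup used above (a summary, not new behaviour):
//   1. PrepareUniqueId runs "c_gen_hccl_id", which produces an HcclRootInfo in
//      the scope variable "Out" (rank 0 creates it and shares it with the peer
//      endpoints listed in the op attributes), then copies it out.
//   2. Prepare copies that root info into "X" and runs "c_comm_init_hccl" with
//      ring_id / rank / rank_ids / device_id so the HCCL communicator for
//      ring 0 exists before any collective op runs.
// Both steps read RANK_ID and DEVICE_ID from the environment, so the test is
// meant to be launched once per participating device.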
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..15940a76f71105a57865c0c8e00b404d087e9485 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. 
+#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client + return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
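  // Editor's note: concretely, without SO_REUSEADDR a restarted trainer would
  // see bind() fail with EADDRINUSE until the previous socket leaves the
  // TIME-WAIT state (typically around 60 seconds); SO_REUSEPORT, available
  // since Linux 3.9 as noted below, additionally allows several sockets to
  // bind the same address/port pair.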
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void 
SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..1ad6f791e1fc34d71b982c24cd04c938db71e82f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
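// Editor's note: the helpers above implement a plain TCP fan-out of the HCCL
// root info, roughly:
//
//   sender (e.g. trainer 0)                     receiver (trainer k)
//   -----------------------                     --------------------
//   ConnectAddr(ep_k, COMM_HEAD)  -- magic -->  CreateListenSocket(ep_k)
//                                               SocketAccept() checks COMM_HEAD
//   HcclGetRootInfo(&id)
//   SendHCCLID(conn, &id)         -- 1024B -->  RecvHCCLID(conn, &id)
//   CloseSocket(conn)                           CloseSocket(client)
//
// Exactly HCCL_UNIQUE_ID_BYTES travel per communicator (hccl_comm_num rounds),
// so both ends stay in lock-step. GenHCCLIdOp drives this exchange once for
// the flat ring (trainer 0 to everyone else) and, under hierarchical
// allreduce, once per inter ring (each leader to its followers) plus once for
// the exter ring (trainer 0 to the other inter-ring leaders).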
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bcb4025e1d204f62a72c1d3f647316a..99a92469e8502bbc500d627899f3c56fa6bccd66 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f9144c09c6912be47c2be7e38da69..39a9ed0c74ef59d8520147572b9ab0da8c567da2 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..69f1f4681a33d68d9a4d0efa09bd33d01834cff6 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..384dfd1fc5f2d3140d5d9278624a65348cc72132 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
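// Editor's note: HCCL as used in this patch exposes no dedicated point-to-point
// receive, so recv_v2 is emulated with a two-rank broadcast: the ring must
// contain exactly two members (hence the nranks == 2 check above), the peer
// attribute names the sender and is used as the broadcast root, and the
// receiver passes its own output buffer as the broadcast data pointer,
// conceptually
//
//   HcclBroadcast(out_ptr, numel, dtype, /*root=*/peer, comm, stream);
//
// The send_v2 NPU kernel later in this patch makes the mirror-image call with
// root set to its own rank, so the pair behaves like send/recv on a two-rank
// ring.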
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + 
out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813a3bb548bec829b7d07f16681043..c60d560e43baed37d1fc4392e8afc356ffdbd949 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ade090fcaac073875de1c1822fa0c45ad4f674b --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf01b1d0a6a1d12f98a4715f346fcdcb0bb6ae05 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
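// Editor's note: like the recv_v2 test above, the send_v2 test below is driven
// entirely by environment variables (a summary of what the code reads, not new
// behaviour):
//   RANK_ID / DEVICE_ID   - this process's rank and NPU device, consumed by
//                           c_gen_hccl_id / c_comm_init_hccl during setup
//   FLAGS_selected_npus   - the NPU the NPUDeviceContext is created on
//   DATA_SIZE             - edge length n of the square tensor (0 < n < 2^15)
//   SRC_RANK              - the sending peer for recv_v2
//   DEST_RANK             - the receiving peer for send_v2 and the value used
//                           to fill the payload and check the received tensor
// so both ranks of the two-process test must be started with a consistent set
// of these variables.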
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto op = 
f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..87bb3397ca2672ce377b74682cb0445e31b03677 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) 
{ + outs[j]->mutable_data(ctx.GetPlace()); + std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index aa0002cc6d1777dab6e598fc7c123e5255d0f094..be299babdba7a4f450bafdf5dce8e686f0493fce 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -132,16 +132,14 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - std::vector outputs; - std::vector choose_idx; - int n = 0; + std::vector ptrs(outs.size()); for (size_t j = 0; j < outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(outs[j]); - choose_idx.push_back(j); - n++; + ptrs[j] = outs[j]->data(); + } else { + ptrs[j] = nullptr; } } PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( @@ -157,10 +155,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis, out_grad->dims().size())); auto input_dims = ins[0]->dims(); - std::vector split_list(n); + std::vector split_list(ins.size()); std::vector xdims_list(input_dims.size()); int total_length = 0; - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < ins.size(); ++i) { split_list[i] = ins[i]->dims()[axis]; total_length += ins[i]->dims()[axis]; } @@ -172,11 +170,6 @@ class ConcatGradXPUKernel : public framework::OpKernel { } xdims_list[axis] = total_length; - std::vector ptrs(n); - for (int i = 0; i < n; ++i) { - ptrs[i] = outputs[i]->data(); - } - auto& dev_ctx = ctx.template device_context(); int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, xdims_list, split_list, axis); diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a0e25fcbaeb02405315895744e50..bf047de86fc21a4d5d9e9ff8f20c9a1982eb25af 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f306358359764b919f9e570cf44f9733a7d178..3ca700e16e6e7bcf4136ca68dd895593a63824ec 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d31524846d9d5aa4b1883f4509a1..ff929ee7dfce79536a9ce7c8ae6878fb7e3871e9 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..591fb55936734ffc675dad5c6912e7cbf4e80471 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90b71d423ba977215ab693f19b562..6513bae839e9894ee2b342c5d61fcbf9191a4123 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h index 22eb2ece4b05b8ad7fad3acdc545e3c98d211f31..7ce63aa9cbbfaaa4adb7834dd33e24cb6491a7a9 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -19,6 +19,7 @@ #include 
"paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94604724303de72f401bfba2e23e..fdd1b776bd8fa3f24fb596af29512f1f781dce4c 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1b0c0e444347af0a90f8244590b84199dc97f931 --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 110bb69a14083ebbe26c1e26e6cf585b70f6b825..7fdb1ccfe9614fc0b30c7e13f564ece217c08b36 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { args.handle = handle; #ifdef PADDLE_WITH_HIP + // MIOPEN need to set groups in cdesc in miopen_desc.h args.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), groups); #else @@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel { platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups)); groups = 1; +#endif +#ifdef PADDLE_WITH_HIP + // MIOPEN do not set groups in wdesc after set groups in cdesc + groups = 1; #endif args.idesc.set(transformed_input, layout_format); args.wdesc.set(transformed_filter_channel, layout_format, groups); @@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP miopenConvFwdAlgorithm_t algo{}; using search = SearchAlgorithm; + workspace_size = search::GetWorkspaceSize(args); + algo = search::Find(args, exhaustive_search, false, workspace_size, ctx); #else cudnnConvolutionFwdAlgo_t algo{}; using search = SearchAlgorithm; -#endif algo = search::Find(args, exhaustive_search, false, ctx); workspace_size = search::GetWorkspaceSize(args, algo); +#endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ @@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = search1::Find(args1, exhaustive_search, deterministic, + workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif } if (filter_grad) { @@ -673,38 +684,68 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = search2::Find(args2, exhaustive_search, deterministic, + workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif filter_algo = search2::Find(args2, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); 
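+      // NOTE: the ordering differs between the two branches on purpose. With
+      // cuDNN the workspace size depends on the chosen algorithm, so Find()
+      // runs first and GetWorkspaceSize(args, algo) second; with MIOPEN the
+      // workspace is sized from the descriptors alone and is then handed to
+      // Find(), which runs the miopenFind* search inside that workspace.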
+#endif } // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + ScalingParamType beta = 0.0f; +#else ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +#endif VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); if (input_grad) { - // When beta is 0, it is unnecessary to reset input_grad. - // When beta is 1, the output cannot be reset since addt strategy used. - for (int i = 0; i < groups; i++) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. #ifdef PADDLE_WITH_HIP + if (ctx.Attr("use_addto")) { + Tensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.wdesc.desc(), filter_data + i * group_offset_filter, - args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in, + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), temp_tensor_data, cudnn_workspace_ptr, workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), + transformed_input_grad_data, &alpha, args1.idesc.desc(), + temp_tensor_data, &beta, args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), + transformed_input_grad_data, cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + } + #else + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -717,9 +758,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { transformed_input_grad_data + i * group_offset_in)); }, workspace_size); -#endif } - +#endif if (!is_sys_pad) { std::vector starts(transformed_input_channel.dims().size(), 0); std::vector axes(transformed_input_channel.dims().size(), 0); @@ -751,23 +791,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { - // Because beta is zero, it is unnecessary to reset filter_grad. - for (int i = 0; i < groups; i++) { +// Because beta is zero, it is unnecessary to reset filter_grad. 
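+      // NOTE: on the MIOPEN branch below the per-group loop is dropped because
+      // the group count is already set on the convolution descriptor (cdesc),
+      // so a single miopenConvolutionBackwardWeights call covers all groups;
+      // the cuDNN branch still loops over groups with explicit buffer offsets.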
#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.idesc.desc(), input_data + i * group_offset_in, - args2.cdesc.desc(), filter_algo, &beta, - args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter, - cudnn_workspace_ptr, workspace_size)); - }, - workspace_size); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, args2.odesc.desc(), output_grad_data, + args2.idesc.desc(), input_data, args2.cdesc.desc(), + filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, + cudnn_workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -780,8 +817,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_grad_data + i * group_offset_filter)); }, workspace_size); -#endif } +#endif if (compute_format == DataLayout::kNHWC) { TransToChannelFirst( @@ -1080,32 +1117,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_algo1 = search1::Find(args1, exhaustive_search, false, + workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); +#endif } if (ddW) { ddw = ddW->data(); args2.handle = handle; args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); args2.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_algo2 = search2::Find(args2, exhaustive_search, false, + workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); +#endif } } @@ -1114,21 +1156,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { args3.handle = handle; args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = search3::Find(args3, exhaustive_search, deterministic, + workspace_size, ctx); #else using search3 = SearchAlgorithm; -#endif filter_algo = search3::Find(args3, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif } if (ddW && dX) { @@ -1143,13 +1187,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = search4::Find(args4, exhaustive_search, deterministic, + workspace_size, 
ctx); #else using search4 = SearchAlgorithm; -#endif data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif } int i_n, i_c, i_d, i_h, i_w; @@ -1176,21 +1224,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { if (ddO) { if (ddX) { ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, &beta, args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForward( + handle, &alpha, args1.idesc.desc(), ddx, + args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, + &beta, args1.odesc.desc(), transformed_ddy_channel, + workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1203,26 +1249,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_ddy_channel + i * group_offset_out)); }, workspace_size); -#endif } +#endif } if (ddW) { - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, &beta, args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForward( + handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), + ddw, args2.cdesc.desc(), fwd_algo2, &beta, + args2.odesc.desc(), transformed_ddy_channel, + workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1235,8 +1279,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_ddy_channel + i * group_offset_out)); }, workspace_size); -#endif } +#endif } if (channel_last) { TransToChannelLast( @@ -1246,21 +1290,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { T* transformed_dy_channel = transformed_dO_channel.data(); if (dW && ddX) { ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.idesc.desc(), ddx + i * group_offset_in, - args3.cdesc.desc(), filter_algo, &beta, - args3.wdesc.desc(), dw + i * group_offset_filter, - workspace_ptr, workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + 
platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, args3.odesc.desc(), transformed_dy_channel, + args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, + &beta, args3.wdesc.desc(), dw, workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1273,27 +1315,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { dw + i * group_offset_filter)); }, workspace_size); -#endif } +#endif } if (dX && ddW) { ddw = ddW->data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in, workspace_ptr, - workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args4.odesc.desc(), transformed_dy_channel, + args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, + &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1306,8 +1346,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_dx + i * group_offset_in)); }, workspace_size); -#endif } +#endif if (!is_sys_pad) { // reverse padded input @@ -1350,7 +1390,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index ddddb7f8641ba1276dff4b01a1caaa6176f97714..23a471cfa006746c5762fa7169248cbae60a7899 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -40,11 +40,6 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; -#else -// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. 
-static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; -static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; -static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif } // namespace operators diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 44ead95a355a25e148fb2d6fcd10a552b92b7f86..befe09c8e6beb3d911521e4ff78f3427a3b0dd78 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -127,57 +127,32 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - bool has_got_workspace_size = true; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - auto& temp = ctx.cuda_device_context(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenFindConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.x->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.odesc.desc(), const_cast(args.o->data()), - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit, false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.fwd_algo; - } - return perf_stat[0].fwd_algo; - }); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.x->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.odesc.desc(), const_cast(args.o->data()), + kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( @@ -194,58 +169,32 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - 
size_t workspace_size = 0; - bool has_got_workspace_size = true; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( - args.handle, args.odesc.desc(), args.o->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.idesc.desc(), const_cast(args.x->data()), - kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, workspace_size_limit, - false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.bwd_data_algo; - } - - return perf_stat[0].bwd_data_algo; - }); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + args.handle, args.odesc.desc(), args.o->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.idesc.desc(), const_cast(args.x->data()), + kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_data_algo; VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( @@ -262,56 +211,32 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - 
platform::dynload:: - miopenFindConvolutionBackwardWeightsAlgorithm( - args.handle, args.odesc.desc(), args.o->data(), - args.idesc.desc(), args.x->data(), args.cdesc.desc(), - args.wdesc.desc(), const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit, false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.bwd_weights_algo; - } - return perf_stat[0].bwd_weights_algo; - }); + + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + args.handle, args.odesc.desc(), args.o->data(), + args.idesc.desc(), args.x->data(), args.cdesc.desc(), + args.wdesc.desc(), const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_weights_algo; VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 364e3ab8d26c3f35f41f319b3d31b63964b93abe..94d1f707b74c2eae17d02771ad7d548e8b908dd9 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel { "and input channel number is %d", output->dims()[1], input->dims()[1])); } - // transform tensor - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); @@ -944,16 +934,12 @@ class DepthwiseConvKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } else { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); - } - if (channel_last) { - TransToChannelLast(context, 
&transformed_output, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } } }; @@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { context.Attr("padding_algorithm"); const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_input(input->type()); - Tensor transformed_output_grad(output_grad->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); @@ -1025,33 +996,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } else { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } } @@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } else { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, 
data_layout); } } } diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 376cefe50258bf788404ddf63854f2ac84f327e5..c4cd5854c0f78ae970ee953f9cb46f5c1b840630 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -202,7 +202,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int iwo_groups = groups; int c_groups = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; @@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + algo = search::Find(args, false, deterministic, workspace_size, ctx); #else using search = SearchAlgorithm; -#endif - algo = search::Find(args, false, deterministic, ctx); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); +#endif // ------------------- cudnn conv transpose forward --------------------- int input_offset = @@ -451,7 +452,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { int iwo_groups = groups; int c_groups = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; @@ -489,10 +490,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { bool deterministic = FLAGS_cudnn_deterministic; T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - if (filter_grad) - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (input_grad) { input_grad_data = input_grad->mutable_data(ctx.GetPlace()); @@ -504,12 +501,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = + search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif data_algo = search1::Find(args1, false, deterministic, ctx); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif } if (filter_grad) { @@ -522,12 +523,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = + search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif filter_algo = search2::Find(args2, false, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); +#endif } // ------------------- cudnn conv backward data --------------------- @@ -875,7 +880,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { int iwo_group = groups; int c_group = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -939,14 +944,18 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.idesc.set(transformed_ddO_channel, iwo_group); args1.wdesc.set(*W, layout, iwo_group); 
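+      // conv_transpose computes its "forward" result with the backward-data
+      // convolution routine, so idesc here describes the transposed-conv
+      // output side (ddO) and odesc below describes its input side (ddX) for
+      // the algorithm search.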
args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args1.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_algo1 = + search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif bwd_algo1 = search1::Find(args1, false, deterministic, ctx); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); +#endif } if (ddW) { @@ -955,15 +964,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.idesc.set(transformed_ddO_channel, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args2.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_algo2 = + search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif bwd_algo2 = search2::Find(args2, false, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); +#endif } } @@ -975,15 +989,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.odesc.set(transformed_ddX_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args3.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = + search3::Find(args3, false, deterministic, workspace_size, ctx); #else using search3 = SearchAlgorithm; -#endif filter_algo = search3::Find(args3, false, deterministic, ctx); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif } if (ddW && dX) { @@ -993,15 +1012,20 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.idesc.set(transformed_dO, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args4.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = + search4::Find(args4, false, deterministic, workspace_size, ctx); #else using search4 = SearchAlgorithm; -#endif data_algo = search4::Find(args4, false, deterministic, ctx); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif } int i_n, i_c, i_d, i_h, i_w; @@ -1059,6 +1083,10 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddW) { for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + Tensor conv_x_ddw(dO->type()); + conv_x_ddw.Resize(transformed_ddO_channel.dims()); + T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1066,11 +1094,17 @@ class 
CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); + bwd_algo2, &beta, args2.idesc.desc(), + conv_x_ddw_data + i * group_offset_out, workspace_ptr, + workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out, &alpha, + args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, + args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out)); #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 651719f1052806ad356f2bc8fd4c2f3a0abe210b..ecf5b6d774a2605c06bbeb2514c981b46e7f6a0d 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -682,9 +682,9 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; depthwiseConv( - dev_ctx, *output_grad, filter, strides, paddings, + dev_ctx, *output_grad, filter, strides, std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - input_grad, data_layout); + dilations, input_grad, data_layout); } if (filter_grad) { diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..721354954c70355e18d330ea458101a23d9cb401 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
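+
+// copy_cross_scope_op copies the value of a variable from one pipeline micro
+// scope (selected by the Id input) into the next micro scope, or back into
+// the main scope when to_main_scope is set. Illustrative driver code, with
+// hypothetical variable names, mirroring how ops are created elsewhere in
+// this patch:
+//
+//   paddle::framework::AttributeMap attrs;
+//   attrs["to_main_scope"] = false;
+//   attrs["num_micro_batches"] = 3;
+//   auto op = paddle::framework::OpRegistry::CreateOp(
+//       "copy_cross_scope", {{"X", {"x"}}, {"Id", {"step_id"}}}, {}, attrs);
+//   op->Run(scope, place);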
+ +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() 
override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is to be copied across micro scopes."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is the id of the current micro scope."); + AddAttr("to_main_scope", "Whether to copy the variable back to the main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is used by the pipeline to copy tensors across micro batch scopes. + It copies the variable value from the micro scope of the given Id to the micro scope at position Id + 1. + If the value needs to be copied back to the main scope, use the to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e175b235f9c1816df8d0b4bb4cc0740778288cef --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + 
Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index a51fce8132418b09c8f2db397fc83c8c69a8a429..9b08f875bb6e6d2e7f8dd1a8cdb142198438e34e 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -12,17 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef PADDLE_WITH_HIP -// HIP not supported yet - #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef __HIPCC__ +#define __syncwarp() __all(1) +#endif + namespace paddle { namespace operators { +#ifdef __HIPCC__ +#define THREADS_PER_BLOCK 64 +#else #define THREADS_PER_BLOCK 32 +#endif #define FULL_MASK 0xffffffff using framework::Tensor; @@ -30,14 +35,22 @@ using framework::Tensor; template __forceinline__ __device__ T warpReduceSum(T val) { for (int offset = 16; offset > 0; offset /= 2) { +#ifdef __HIPCC__ + val += __shfl_down(val, offset); +#else val += __shfl_down_sync(FULL_MASK, val, offset); +#endif } return val; } template __forceinline__ __device__ T blockReduceSum(T val) { +#ifdef __HIPCC__ + static __shared__ T shared[64]; +#else static __shared__ T shared[32]; +#endif int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; @@ -483,5 +496,3 @@ REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, ops::CorrelationCUDAKernel); REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, ops::CorrelationCUDAGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 3181e4b1d990b775f2c80a5d13f391886f83b080..b7859237e737a1af6bfc00c2fd6b5a7b610374e4 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -85,20 +85,12 @@ class ScopedRNNBase { dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, dropout_state, seed_, state_size); -// ------------------- cudnn rnn descriptors --------------------- -#if CUDNN_VERSION >= 6000 + // ------------------- cudnn rnn descriptors --------------------- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 13a3e7d09b9f628f31bb9ff3b6137acf6d929c5c..a6a23a91c76c02ec656ad1d13aa41c1d3d93c8fd 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -168,18 +168,11 @@ struct CudnnRNNCache { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); -#if CUDNN_VERSION >= 6000 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd82c74885b9496bf64729a74a6527e68c80faf6 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. + PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +class DecodeJpegOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto mode = ctx->Attrs().Get("mode"); + std::vector out_dims; + + if (mode == "unchanged") { + out_dims = {-1, -1, -1}; + } else if (mode == "gray") { + out_dims = {1, -1, -1}; + } else if (mode == "rgb") { + out_dims = {3, -1, -1}; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU: ", mode)); + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class DecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. 
It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. +)DOC"); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + decode_jpeg, ops::DecodeJpegOp, ops::DecodeJpegOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(decode_jpeg, ops::CPUDecodeJpegKernel) diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..11616b0e0c4daced68e8faf16a319d0c40f66244 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +static cudaStream_t nvjpeg_stream = nullptr; +static nvjpegHandle_t nvjpeg_handle = nullptr; + +void InitNvjpegImage(nvjpegImage_t* img) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + img->channel[c] = nullptr; + img->pitch[c] = 0; + } +} + +template +class GPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Create nvJPEG handle + if (nvjpeg_handle == nullptr) { + nvjpegStatus_t create_status = + platform::dynload::nvjpegCreateSimple(&nvjpeg_handle); + + PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegCreateSimple failed: ", + create_status)); + } + + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = + platform::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegJpegStateCreate failed: ", + state_status)); + + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + auto* x = ctx.Input("X"); + auto* x_data = x->data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + nvjpeg_handle, x_data, (size_t)x->numel(), &components, &subsampling, + widths, heights); + + PADDLE_ENFORCE_EQ( + info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); + + int 
width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + auto mode = ctx.Attr("mode"); + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + nvjpegImage_t out_image; + InitNvjpegImage(&out_image); + + // create nvjpeg stream + if (nvjpeg_stream == nullptr) { + cudaStreamCreateWithFlags(&nvjpeg_stream, cudaStreamNonBlocking); + } + + int sz = widths[0] * heights[0]; + + auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + T* data = out->mutable_data(ctx.GetPlace()); + + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + nvjpeg_handle, nvjpeg_state, x_data, x->numel(), output_format, + &out_image, nvjpeg_stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(decode_jpeg, ops::GPUDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index b4c27a63dbd2f2fdbd9b018aa1606a79d5b0002d..388b8531571086f8ba464a51a4ebce7fb816bc31 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -49,14 +49,11 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, anchor_width = scale_w * base_w; anchor_height = scale_h * base_h; - T xmin = (x_ctr - 0.5 * (anchor_width - 1)); - T ymin = (y_ctr - 0.5 * (anchor_height - 1)); - T xmax = (x_ctr + 0.5 * (anchor_width - 1)); - T ymax = (y_ctr + 0.5 * (anchor_height - 1)); - out[i * 4] = xmin; - out[i * 4 + 1] = ymin; - out[i * 4 + 2] = xmax; - out[i * 4 + 3] = ymax; + T xmin = (x_ctr - .5f * (anchor_width - 1)); + T ymin = (y_ctr - .5f * (anchor_height - 1)); + T xmax = (x_ctr + .5f * (anchor_width - 1)); + T ymax = (y_ctr + .5f * (anchor_height - 1)); + reinterpret_cast(out)[i] = make_float4(xmin, ymin, xmax, ymax); } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index e0e499d76a19ba5f6b91ba4c8797684fb53c7caa..599f6935736f946bc021cf70177a45ed2b9679e3 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -22,6 +22,19 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +#ifdef PADDLE_WITH_CUDA +template +extern __global__ void GenAnchors(T* out, const T* aspect_ratios, + const int ar_num, const T* anchor_sizes, + const int as_num, const T* stride, + const int sd_num, const int height, + const int width, const T offset); + +template +extern __global__ void SetVariance(T* out, const T* var, const int vnum, + const int num); +#endif + template class AnchorGeneratorOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 27852d43948327c498e73a51e4151a25c31f64c3..725983f8153e4f14f41552ae26800d363a863ec1 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Determine temporary device storage requirements size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0, + sizeof(T) * 8, ctx.stream()); // Allocate temporary storage auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, - idx_out, num); + idx_out, num, 0, sizeof(T) * 8, ctx.stream()); } template @@ -275,15 +276,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const T *boxes = proposals.data(); auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - framework::Vector mask(boxes_num * col_blocks); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, - mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace())), - pixel_offset); + auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); + + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, mask_dev, pixel_offset); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + std::vector mask_host(boxes_num * col_blocks); + memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev, + boxes_num * col_blocks * sizeof(uint64_t), ctx.stream()); + std::vector keep_vec; int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { @@ -293,7 +298,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, if (!(remv[nblock] & (1ULL << inblock))) { ++num_to_keep; keep_vec.push_back(i); - uint64_t *p = &mask[0] + i * col_blocks; + uint64_t *p = mask_host.data() + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index bc74c80e0315fac6de3ca575d53b23965adf4179..ffd9ac6b2af806ed92bea6484a077cc83de7af3c 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( nullptr, temp_storage_bytes, concat_scores.data(), keys_out, 
idx_in, - idx_out, total_roi_num); + idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort score to get corresponding index cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data(), - keys_out, idx_in, idx_out, total_roi_num); + keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8, + dev_ctx.stream()); index_out_t.Resize({real_post_num}); Tensor sorted_rois; sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); @@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, sorted_batch_id.data(), out_id_data, - batch_idx_in, index_out_t.data(), real_post_num); + batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort batch_id to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data(), - out_id_data, batch_idx_in, index_out_t.data(), real_post_num); + out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); @@ -198,8 +201,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int threads = kNumCUDAThreads; // get length-based lod by batch ids - GetLengthLoD<<>>(real_post_num, out_id_data, - length_lod_data); + GetLengthLoD<<>>( + real_post_num, out_id_data, length_lod_data); std::vector length_lod_cpu(lod_size); memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place, length_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index cc61035309eaab31534119ab088bf537bf71c242..7ccb354e1773a35957f9df97d26d79d50cbd4fd6 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int dist_blocks = NumBlocks(roi_num); int threads = kNumCUDAThreads; // get target levels and sub_lod list - GPUDistFpnProposalsHelper<<>>( + GPUDistFpnProposalsHelper<<>>( roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data, pixel_offset); - dev_ctx.Wait(); auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); Tensor index_in_t; @@ -150,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, - target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -160,29 +159,30 @@ class 
GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // sort target level to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int* restore_idx_data = restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); // sort current index to get restore index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, - restore_idx_data, roi_num); + restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int start = 0; auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); + std::vector sub_lod_list_cpu(lod_size * num_level); + memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place, + sub_lod_list_data, sizeof(int) * lod_size * num_level, + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); - int* sub_lod_data = sub_lod.data(); // transfer length-based lod to offset-based lod std::vector offset(1, 0); - std::vector sub_lod_cpu(lod_size); - memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, - sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); - dev_ctx.Wait(); for (int j = 0; j < lod_size; ++j) { - offset.emplace_back(offset.back() + sub_lod_cpu[j]); + offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]); } int sub_rois_num = offset.back(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 8359fbab519b36f58fbeaf02082f02a1372993fc..e8ab628db16bdd591adb670bafc5e05aeac8efed 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { memory::Copy(place, rpn_roi_probs_data + num_proposals, place, scores.data(), sizeof(T) * scores.numel(), dev_ctx.stream()); - dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); tmp_num.push_back(proposals.dims()[0]); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 337a76f9f976f852973c4dd413427890a2ef37b3..5977a434a6023f4463c9fb7ab1067f038747e44c 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -45,7 +45,8 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + platform::errors::InvalidArgument( + "The polygon_box_transform operator needs to be executed on GPU.")); auto* in = ctx.Input("Input"); auto in_dims = in->dims(); const T* in_data = in->data(); diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 2e03622e10f0f477cd0b66c0bcafe9f7bf29a0f8..7e3ab6be664cb92370d50688ab93f9462ec89463 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
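(The recurring change in the detection-op hunks above is to pass an explicit bit range and the device context's stream to cub::DeviceRadixSort, so the sort is queued on Paddle's own stream rather than the default stream. Below is a minimal sketch of that two-phase CUB call; the buffer names, KeyT, and the allocator are illustrative assumptions, not code from the patch.)

// Phase 1: pass nullptr as temp storage to query the required size; no work runs.
size_t temp_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending(
    nullptr, temp_bytes, keys_in, keys_out, vals_in, vals_out, num_items,
    /*begin_bit=*/0, /*end_bit=*/sizeof(KeyT) * 8, stream);
// Phase 2: allocate that much device memory and run the sort asynchronously on `stream`
// (in the patch this is memory::Alloc(place, ...) and dev_ctx.stream()).
void* d_temp = AllocateOnDevice(temp_bytes);  // illustrative allocator
cub::DeviceRadixSort::SortPairsDescending(
    d_temp, temp_bytes, keys_in, keys_out, vals_in, vals_out, num_items,
    /*begin_bit=*/0, /*end_bit=*/sizeof(KeyT) * 8, stream);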
+#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4ba2f0421e73a6148b89eb970e71..6a34ef48a169dc5e31f845f9993eef721faf2e7c 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc45d44e8336fc662ae1b4337445f738..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc 
brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c7648668bc75c130ab317ba4ab93cc7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c91fadec1fbb282b08e542a07d964a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - 
recorder.GetAndClear("param1", i, &ret); - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c868968734df35383f2ddf2ba512b137..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5a3076e17b0b54cf153e2f3b9edc31..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ace835e32d2e1a25e7738a9e88ed7bd..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
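GetChannel and HandleGetResponse above implement a borrow-and-return pool: each endpoint owns a small, fixed number of ChannelContext objects (brpc_channel_num_per_server_ of them) kept in a blocking queue, an RPC Pops one for the duration of the call, and the response callback Pushes it back. The following is a standalone sketch of that pattern only; BlockingPool is an illustrative stand-in, not Paddle's framework::BlockingQueue.

#include <condition_variable>
#include <deque>
#include <mutex>
#include <utility>

// Minimal blocking pool: Pop() blocks until an item is available, Push()
// returns an item and wakes one waiter.
template <typename T>
class BlockingPool {
 public:
  void Push(T item) {
    {
      std::lock_guard<std::mutex> lk(mu_);
      items_.push_back(std::move(item));
    }
    cv_.notify_one();
  }

  T Pop() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return !items_.empty(); });
    T item = std::move(items_.front());
    items_.pop_front();
    return item;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<T> items_;
};

int main() {
  BlockingPool<int> pool;
  for (int i = 0; i < 4; ++i) pool.Push(i);  // like the 4 brpc channels per pserver
  int ctx = pool.Pop();  // borrowed for one RPC
  pool.Push(ctx);        // returned from the response callback
  return 0;
}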
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec5784715c0a68c1af2e31d640dfc60277..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36debd3b66014648f3bcad395d1b4a3579..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb29c7c0fe00d7f346025426b51e1cb3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ 
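The serialization helpers above frame each field as a 4-byte field number, an 8-byte payload length, and then the raw payload bytes. The sketch below shows only that byte layout; the real IOBufWriter::Append writes into a butil::IOBuf, and a std::string is used here to keep the example free of the brpc dependency. Appending the raw in-memory representation of k and vlen matches the little-endian reads on the receiving side only on a little-endian host, which is what the deleted code assumes.

#include <cstdint>
#include <cstdio>
#include <string>

// Emit one framed field: [4-byte field number][8-byte length][payload].
void AppendField(std::string* buf, int k, const char* v, int64_t vlen) {
  buf->append(reinterpret_cast<const char*>(&k), 4);
  buf->append(reinterpret_cast<const char*>(&vlen), 8);
  buf->append(v, static_cast<size_t>(vlen));
}

int main() {
  std::string wire;
  const char payload[] = "tensor-bytes";
  AppendField(&wire, /*field number=*/7, payload, sizeof(payload) - 1);

  // 12 bytes of header followed by the payload itself.
  std::printf("total %zu bytes = 12 header + %zu payload\n", wire.size(),
              wire.size() - 12);
  return 0;
}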
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b11f86870d8fbe980b8adad5c96ea8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf20e667aaad55baf844387dbd31020..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc0813d7cf29963c78947d52bcaea9643..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a77057bf496eca3249230a3fbc278c262..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a725367f74dbcf1fa6a2ad49469d64725..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
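The Parse loop just deleted walks the request attachment as a sequence of (field number, length, payload) records using protobuf's little-endian stream helpers. Below is a reader-side counterpart to the append sketch earlier, hand-building a buffer and walking it with the same CodedInputStream calls; the real code wraps a butil::IOBuf rather than an ArrayInputStream, and the field handling here is illustrative only.

#include <cstdint>
#include <iostream>
#include <string>

#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

int main() {
  // Hand-built wire buffer: one record with field number 7 and 5 payload bytes.
  std::string wire;
  std::uint32_t field = 7;
  std::uint64_t len = 5;
  wire.append(reinterpret_cast<const char*>(&field), 4);
  wire.append(reinterpret_cast<const char*>(&len), 8);
  wire.append("hello", 5);

  google::protobuf::io::ArrayInputStream raw(wire.data(),
                                             static_cast<int>(wire.size()));
  google::protobuf::io::CodedInputStream in(&raw);

  std::uint32_t tag = 0;
  std::uint64_t num_bytes = 0;
  while (in.ReadLittleEndian32(&tag) && in.ReadLittleEndian64(&num_bytes)) {
    std::string payload;
    if (!in.ReadString(&payload, static_cast<int>(num_bytes))) {
      break;  // truncated record
    }
    std::cout << "field " << tag << " -> " << payload << "\n";
  }
  return 0;
}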
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead510393736a9253af2ae1068357a68..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df9834728682ea131f3ef0d60786908e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d5a5a882320cbff1e67a353b4af5f8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 4964923286094abe01559b715a20eeb8da7e2a0d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e59fec0991712ec4f6d6829b76cfb4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a6414698fec7a5483195118551afac55f49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - 
std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - 
auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && 
!need_global_step_) return; - - auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - 
"internal error in RPCClient")); - } - - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d3923f8536f9bb3d455f9bd6c12d67184..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba27aa86fe333312340788dc0aef0d47..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -struct CommContext { - CommContext() = default; - - CommContext(const std::string &name, const std::vector &names, - const std::vector &emap, - const std::vector §ions, - const std::vector &origin_names, int id, - bool merge_add_ = true, bool is_sparse_ = true, - bool is_distributed_ = false) - : var_name(name), - splited_varnames(names), - epmap(emap), - height_sections(sections), - origin_varnames(origin_names), - trainer_id(id), - merge_add(merge_add_), - is_sparse(is_sparse_), - is_distributed(is_distributed_) {} - - CommContext(const CommContext &ctx) { - var_name = ctx.var_name; - splited_varnames = ctx.splited_varnames; - epmap = ctx.epmap; - height_sections = ctx.height_sections; - trainer_id = ctx.trainer_id; - merge_add = ctx.merge_add; - is_sparse = ctx.is_sparse; - origin_varnames = ctx.origin_varnames; - is_distributed = ctx.is_distributed; - } - - std::string print() const { - std::stringstream ss; - - ss << "varname: " << var_name << " trainer_id: " << trainer_id << " "; - - for (size_t i = 0; i < splited_varnames.size(); i++) { - ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i] - << " section: " << height_sections[i] << " "; - } - - ss << "origin varnames: "; - for (size_t i = 0; i < origin_varnames.size(); i++) { - ss << origin_varnames[i] << " "; - } - - ss << " aggregation->add: " << merge_add << " "; - ss << " is_sparse: " << is_sparse << "\n"; - ss << " is_distributed: " << is_distributed << "\n"; - - return ss.str(); - } - - std::string var_name; - std::vector splited_varnames; - std::vector epmap; - std::vector height_sections; - std::vector origin_varnames; - int trainer_id; - bool merge_add; - bool is_sparse; - bool is_distributed; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc deleted file mode 100644 index 38b7c8b00317e6880434e975438c72ba9248aee2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/communicator_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/operators/distributed/communicator.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -TEST(communicator, merge_lod_tensors) { - auto cpu_place = platform::CPUPlace(); - auto dims = framework::make_ddim({2, 3}); - std::vector> in_vars; - float out_value = 0; - for (auto i = 0; i < 10; ++i) { - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *tensor = var->GetMutable(); - auto *data = tensor->mutable_data(dims, cpu_place); - for (auto j = 0; j < tensor->numel(); ++j) { - data[j] = static_cast(i); - } - out_value += static_cast(i); - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_tensor = scope->FindVar(out_name)->Get(); - auto *out_data = out_tensor.data(); - ASSERT_EQ(out_tensor.dims(), dims); - for (auto i = 0; i < out_tensor.numel(); ++i) { - ASSERT_EQ(out_data[i], out_value); - } -} - -TEST(communicator, merge_selected_rows) { - auto cpu_place = platform::CPUPlace(); - int64_t width = 10; - std::vector> in_vars; - const int64_t height = 100; - for (auto i = 0; i < 10; ++i) { - std::vector rows; - for (auto k = 0; k <= i; ++k) { - rows.push_back(k); - } - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *slr = var->GetMutable(); - slr->set_height(height); - slr->set_rows(rows); - auto dims = - framework::make_ddim({static_cast(rows.size()), width}); - auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); - for (size_t i = 0; i < rows.size(); ++i) { - for (auto j = 0; j < width; ++j) { - data[i * width + j] = static_cast(rows[i]); - } - } - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_slr = scope->FindVar(out_name)->Get(); - auto &out_t = out_slr.value(); - auto *out_data = out_t.data(); - ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); - std::vector out_values; - out_values.reserve(10); - for (auto i = 0; i < 10; ++i) { - out_values.push_back(static_cast(i * (10 - i))); - } - for (size_t i = 0; i < out_slr.rows().size(); ++i) { - ASSERT_EQ(out_slr.rows()[i], static_cast(i)); - for (auto j = 0; j < width; ++j) { - ASSERT_EQ(out_data[i * width + j], out_values[i]); - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h deleted file mode 100644 index 5917c18fb0d20104738c3d5868d419135f5108be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/distributed.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#ifdef PADDLE_WITH_DISTRIBUTE - -#ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/communicator.h" - -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" -#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer -#define RPCCLIENT_T paddle::operators::distributed::GRPCClient - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer -#define RPCCLIENT_T paddle::operators::distributed::BRPCClient - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc deleted file mode 100644 index 7d6756b41363d12af68402817cfee1df408b8827..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. - -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -GrpcByteBufferSource::GrpcByteBufferSource() {} - -bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { - cur_ = -1; - left_ = 0; - ptr_ = nullptr; - byte_count_ = 0; - bool ok = src.Dump(&slices_).ok(); - if (!ok) { - slices_.clear(); - } - return ok; -} - -bool GrpcByteBufferSource::Next(const void** data, int* size) { - // Use loop instead of if in case buffer contained empty slices. - while (left_ == 0) { - // Advance to next slice. 
- cur_++; - if (cur_ >= slices_.size()) { - return false; - } - const ::grpc::Slice& s = slices_[cur_]; - left_ = s.size(); - ptr_ = reinterpret_cast(s.begin()); - } - - *data = ptr_; - *size = left_; - byte_count_ += left_; - ptr_ += left_; - left_ = 0; - return true; -} - -void GrpcByteBufferSource::BackUp(int count) { - ptr_ -= count; - left_ += count; - byte_count_ -= count; -} - -bool GrpcByteBufferSource::Skip(int count) { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; -} - -google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { - return byte_count_; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h deleted file mode 100644 index 486870de7a554e675bb01492e775654bbcb34da3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
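Both classes in this pair of deleted files, GrpcByteBufferSource in the .cc above and GrpcBufferReader in the header that follows, implement protobuf's ZeroCopyInputStream over a list of gRPC slices: Next() hands out an entire slice without copying, BackUp() returns the unread tail of the last slice, Skip() discards bytes, and ByteCount() reports how many bytes have been yielded so far. The sketch below captures that contract over plain std::string slices; it is illustrative only and does not derive from the protobuf interface.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

// Toy zero-copy source: yields each slice's buffer directly, no memcpy.
class SliceSource {
 public:
  explicit SliceSource(std::vector<std::string> slices)
      : slices_(std::move(slices)) {}

  bool Next(const void** data, int* size) {
    while (left_ == 0) {  // advance, skipping empty slices
      if (++cur_ >= slices_.size()) return false;
      ptr_ = slices_[cur_].data();
      left_ = static_cast<int>(slices_[cur_].size());
    }
    *data = ptr_;
    *size = left_;
    byte_count_ += left_;
    ptr_ += left_;
    left_ = 0;
    return true;
  }

  void BackUp(int count) {  // give back the last `count` bytes
    ptr_ -= count;
    left_ += count;
    byte_count_ -= count;
  }

  int64_t ByteCount() const { return byte_count_; }

 private:
  std::vector<std::string> slices_;
  std::size_t cur_ = static_cast<std::size_t>(-1);
  int left_ = 0;
  const char* ptr_ = nullptr;
  int64_t byte_count_ = 0;
};

int main() {
  SliceSource src({"head", "", "payload"});
  const void* data = nullptr;
  int size = 0;
  assert(src.Next(&data, &size) && size == 4);  // "head"
  src.BackUp(1);                                // return the trailing 'd'
  assert(src.Next(&data, &size) && size == 1);  // the backed-up byte
  assert(src.Next(&data, &size) && size == 7);  // "payload", empty slice skipped
  assert(!src.Next(&data, &size));              // exhausted
  assert(src.ByteCount() == 11);
  return 0;
}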
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f1850990ca7ade55d0c9ddd83b5fbab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
PADDLE_THROW(platform::errors::External( - "%s meets grpc error, error_code is %d, error message is %s, error " - "details is %s.", - c->GetVarHandlePtr()->String(), c->status_.error_code(), - c->status_.error_message(), c->status_.error_details())); - c->Finish(false); - } - - bool notify = false; - { - std::lock_guard lk(sync_mutex_); - req_count_--; - notify = (req_count_ <= 0 || !c->status_.ok()); - } - - delete c; - - if (notify) { - sync_cond_.notify_all(); - } - } - - // Last log message - // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a - // static Mutex log_mutex is used for synchronization, which might have been - // destructed at this moment. - if (FLAGS_v >= 3) { - std::string msg("GRPCClient Proceed end"); - fwrite(msg.c_str(), msg.length(), 1, stderr); - } -} - -std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - - // Channel configurations: - grpc::ChannelArguments args; - args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); - if (FLAGS_rpc_disable_reuse_port) { - args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); - args.SetMaxSendMessageSize(std::numeric_limits::max()); - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - - auto ch = - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); - channels_[ep] = ch; - return ch; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h deleted file mode 100644 index 5885f944b60a15c2b4810a3968f0ee5406a36a70..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ /dev/null @@ -1,321 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
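The client calls deleted above (AsyncSendVar, _AsyncGetVar, AsyncPrefetchVar, AsyncSendAndRecv) share one retry idiom: issue the asynchronous call, Wait() on the returned handle, and if Proceed() marked the handle should_retry, sleep a few random milliseconds and re-issue, up to FLAGS_rpc_retry_times attempts. A generic standalone sketch of that loop follows; RpcResult, CallWithRetry, and the flaky callback are hypothetical stand-ins, not Paddle or gRPC APIs.

#include <chrono>
#include <functional>
#include <iostream>
#include <random>
#include <thread>

struct RpcResult {       // hypothetical stand-in for the VarHandle state
  bool ok = false;
  bool should_retry = false;
};

// Retry a call with a small random backoff, mirroring the pattern of the
// deleted client methods: issue + wait, then retry only when asked to.
RpcResult CallWithRetry(const std::function<RpcResult()>& issue_call,
                        int max_retries) {
  std::random_device rd;
  for (int attempt = 0;; ++attempt) {
    RpcResult r = issue_call();  // issue the RPC and wait for completion
    if (r.ok || !r.should_retry || attempt >= max_retries) return r;
    std::cout << "rpc call failed, retry times " << attempt << "\n";
    std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
  }
}

int main() {
  int failures_left = 2;
  auto flaky = [&]() {
    RpcResult r;
    r.ok = (failures_left-- <= 0);
    r.should_retry = !r.ok;
    return r;
  };
  RpcResult r = CallWithRetry(flaky, /*max_retries=*/3);
  std::cout << "final ok: " << std::boolalpha << r.ok << "\n";
  return 0;
}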
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b6957791490192a142321792b82cf906bfc9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. 
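The rest of SerializeToByteBuffer below assembles the outgoing message from up to four gRPC slices so that the tensor bytes are never copied: slice 0 holds the encoded VarMsg header, slice 1 references the tensor payload in place, and SelectedRows additionally contribute slice 2 (the rows-length field) and slice 3 (the raw row ids). A standalone sketch of that layout using plain byte strings (illustrative only; the header text and sizes are invented and this is not the real wire encoding):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Pretend payloads; in the real code slices 1 and 3 alias tensor memory.
  std::string header = "varname=myvar;type=SELECTED_ROWS";      // slice 0
  std::string tensor_payload(564 * 128 * sizeof(float), '\0');  // slice 1
  std::vector<int64_t> rows = {0, 1, 2, 3};
  std::string rows_meta =
      "rows_bytes=" + std::to_string(rows.size() * sizeof(int64_t));  // slice 2
  std::string rows_bytes(reinterpret_cast<const char*>(rows.data()),
                         rows.size() * sizeof(int64_t));              // slice 3

  std::vector<std::string> slices = {header, tensor_payload, rows_meta,
                                     rows_bytes};
  std::size_t total = 0;
  for (std::size_t i = 0; i < slices.size(); ++i) {
    std::cout << "slice " << i << ": " << slices[i].size() << " bytes\n";
    total += slices[i].size();
  }
  std::cout << "message length: " << total << " bytes\n";
  return 0;
}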
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - ::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetRecvVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h deleted file mode 100644 index 932f3e2f069a2bfe1dec9318446e1bf064d2e317..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/distributed/grpc/grpc_serde.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/port.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -typedef void (*DestroyCallback)(void*); - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string(), - const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc deleted file mode 100644 index d407a72938a741d9451f46ba067e639e3ada6544..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d7568e5a6da63b7ac8bb35a5b95439..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
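The serde test removed here follows a round-trip pattern: fill a variable with a known constant (31.9 for the LoD tensor case, 32.7 for SelectedRows), serialize it to a ByteBuffer, flatten the slices into a string, parse that back, and compare every element. A stripped-down sketch of the same round-trip idea over a plain float vector (illustrative only; no protobuf or Paddle types are involved):

#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

// Toy "serialize": raw float bytes appended to a string.
std::string Serialize(const std::vector<float>& values) {
  std::string buf(values.size() * sizeof(float), '\0');
  std::memcpy(&buf[0], values.data(), buf.size());
  return buf;
}

std::vector<float> Deserialize(const std::string& buf) {
  std::vector<float> values(buf.size() / sizeof(float));
  std::memcpy(values.data(), buf.data(), buf.size());
  return values;
}

int main() {
  // Mirror the deleted test's shape and constant: 512*8*4*2 floats of 31.9f.
  std::vector<float> original(512 * 8 * 4 * 2, 31.9f);
  std::vector<float> restored = Deserialize(Serialize(original));
  assert(restored.size() == original.size());
  for (std::size_t i = 0; i < restored.size(); ++i) {
    assert(restored[i] == original[i]);
  }
  return 0;
}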
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
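Each request class in this removed server hands its slot index to gRPC as the completion-queue tag by pushing req_id through intptr_t to void*, and the serving loop later reverses the cast to find the request again in rpc_reqs_. As a reminder of that round-trip, here is a minimal, gRPC-free sketch; req_id below is a hypothetical slot number, not a value taken from this patch.

#include <cassert>
#include <cstdint>

int main() {
  // Encode a request-slot index as a completion-queue tag (a void*), the
  // same pattern the removed code uses when calling RequestAsyncUnary.
  int req_id = 42;  // hypothetical slot index into rpc_reqs_
  void* tag = reinterpret_cast<void*>(static_cast<std::intptr_t>(req_id));

  // When cq->Next(&tag, &ok) later hands the tag back, decode it again.
  int decoded = static_cast<int>(reinterpret_cast<std::intptr_t>(tag));
  assert(decoded == req_id);
  return 0;
}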
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8cebb400680458a1163d52b01f8c8dc2e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
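HandleRequest, declared just below and implemented in the grpc_server.cc hunk above, is the usual async-gRPC event loop: block on cq->Next, map the returned tag back to a request slot, then either call Process() (whose Finish() call makes the queue fire once more for the same slot) or, once the request reports FINISH, delete it and register a fresh request in that slot. The following is a simplified, gRPC-free model of that PROCESS/FINISH state machine; it is only a sketch, and names such as FakeRequest are invented for illustration.

#include <iostream>
#include <memory>
#include <queue>
#include <vector>

enum CallStatus { PROCESS = 0, FINISH };

struct FakeRequest {
  int req_id = 0;
  CallStatus status = PROCESS;
  void Process() {
    std::cout << "handling request in slot " << req_id << "\n";
    status = FINISH;  // the real code flips to FINISH inside RequestBase::Finish
  }
};

int main() {
  constexpr int kRequestBufSize = 4;  // mirrors the constant in the removed header
  std::vector<std::unique_ptr<FakeRequest>> slots(kRequestBufSize);
  std::queue<int> events;  // stands in for tags delivered by cq->Next()

  for (int i = 0; i < kRequestBufSize; ++i) {
    slots[i] = std::make_unique<FakeRequest>();
    slots[i]->req_id = i;
    events.push(i);  // pretend the corresponding RPC arrives immediately
  }

  while (!events.empty()) {
    int req_id = events.front();
    events.pop();
    FakeRequest* base = slots[req_id].get();
    switch (base->status) {
      case PROCESS:
        base->Process();
        events.push(req_id);  // Finish() makes the completion queue fire again
        break;
      case FINISH:
        // The removed code deletes the request and calls TryToRegisterNewOne
        // for the same slot; here we simply retire it so the loop terminates.
        slots[req_id].reset();
        break;
    }
  }
  return 0;
}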
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853debb37a08921db2bfc5968dc7094e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc924dfe810bf6375787e2ebc4f75dd3e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bacd7ffee6ac7725951b967f7eb2da15..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
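The GRPCVariableResponse parser removed above walks the raw sendrecv.VariableMessage bytes itself: every protobuf tag packs the field number in the high bits (tag >> 3) and the wire type in the low three bits (tag & 0x7), with varint and length-delimited fields then handled case by case. The short round-trip below illustrates that encoding with protobuf's CodedOutputStream/CodedInputStream; the field numbers are hypothetical stand-ins, not the ones defined in send_recv.proto.

#include <cstdint>
#include <string>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

int main() {
  // Hypothetical field numbers; the real ones are declared in send_recv.proto.
  const int kVarnameField = 1;   // wire type 2: length-delimited
  const int kLodLevelField = 5;  // wire type 0: varint

  std::string buf;
  {
    google::protobuf::io::StringOutputStream raw(&buf);
    google::protobuf::io::CodedOutputStream out(&raw);
    out.WriteTag((kVarnameField << 3) | 2);  // tag = (field_number << 3) | wire_type
    const std::string name = "nce_w@GRAD.block0";
    out.WriteVarint32(static_cast<uint32_t>(name.size()));
    out.WriteString(name);
    out.WriteTag((kLodLevelField << 3) | 0);
    out.WriteVarint64(1);
  }  // CodedOutputStream flushes into buf on destruction

  google::protobuf::io::CodedInputStream in(
      reinterpret_cast<const uint8_t*>(buf.data()), static_cast<int>(buf.size()));
  uint32_t tag;
  while ((tag = in.ReadTag()) != 0) {
    int field = tag >> 3;       // GetTagFieldNumber in the removed code
    int wire_type = tag & 0x7;  // GetTagWireType in the removed code
    if (wire_type == 2) {       // length-delimited: size varint, then payload
      uint32_t length = 0;
      in.ReadVarint32(&length);
      std::string value;
      in.ReadString(&value, length);
    } else if (wire_type == 0) {  // plain varint
      uint64_t v = 0;
      in.ReadVarint64(&v);
    }
  }
  return 0;
}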
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f533489860447532e80a08f48ac2750e48c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h 
b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted file mode 100644 index d96433c318b3578446ffe484d40711d2559f91ab..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map worker_status_map_; - std::unique_ptr monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a95d604749be1de787c7688ce27848..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc8a339fbc2b4cc0fed841bd24e9645..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
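The HeartBeatMonitor removed above is essentially a watchdog on the chief parameter server: every trainer update stamps a per-worker timestamp, and a background thread periodically compares those stamps against FLAGS_worker_update_interval_secs and raises an error for any worker that has gone quiet. A stripped-down sketch of that idea, using std::chrono instead of the integer-second timestamps of the original and with invented names such as TinyMonitor, might look like this:

#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>
#include <unordered_map>
#include <vector>

// Minimal watchdog sketch: record a heartbeat per worker, report the
// workers whose most recent heartbeat is older than the timeout.
class TinyMonitor {
 public:
  explicit TinyMonitor(int timeout_secs) : timeout_secs_(timeout_secs) {}

  void Update(int worker_id) {
    std::lock_guard<std::mutex> lock(mu_);
    last_seen_[worker_id] = std::chrono::steady_clock::now();
  }

  std::vector<int> Stale() {
    std::lock_guard<std::mutex> lock(mu_);
    std::vector<int> stale;
    auto now = std::chrono::steady_clock::now();
    for (const auto& kv : last_seen_) {
      auto age = std::chrono::duration_cast<std::chrono::seconds>(now - kv.second);
      if (age.count() >= timeout_secs_) stale.push_back(kv.first);
    }
    return stale;
  }

 private:
  int timeout_secs_;
  std::mutex mu_;
  std::unordered_map<int, std::chrono::steady_clock::time_point> last_seen_;
};

int main() {
  TinyMonitor monitor(/*timeout_secs=*/1);
  monitor.Update(0);
  monitor.Update(1);
  std::this_thread::sleep_for(std::chrono::seconds(2));
  monitor.Update(1);  // worker 1 heartbeats again, worker 0 goes stale
  for (int id : monitor.Stale()) std::cout << "stale worker: " << id << "\n";
  return 0;
}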
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - 
std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // 
for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j 
+ 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, 
const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = 
value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c3353f98c052cfbcd108f859ea03e7a3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - 
std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set 
s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - "embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 6fd3a998813c0ba32b8b694b6655e1c73f45d62b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c48fa162e18823bb237901e064315c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791e23cdebfb35135a292ad846c2576c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca2541c35b515a357108cd303ae0eeff91..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fcadd4263bf3aaaec3dd0047c9fb4b36..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
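The note above is the key context for this deletion: rather than building a protobuf message object and copying tensor bytes into it, the helper framed gRPC payloads by hand using protobuf's wire format, so tensor data could be appended to the output buffer exactly once. As a minimal, self-contained sketch of that framing (the field number and the float payload below are invented for illustration; only the varint / length-delimited layout itself is taken from the code that follows):

// Sketch of protobuf wire-format framing for a length-delimited field:
// key = (field_number << 3) | 2, then a varint length, then the raw bytes.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static void EncodeVarint32(uint32_t v, std::string* out) {
  // Low 7 bits first, MSB set on every byte except the last.
  while (v >= 0x80) {
    out->push_back(static_cast<char>((v & 0x7f) | 0x80));
    v >>= 7;
  }
  out->push_back(static_cast<char>(v));
}

static void WriteLengthDelimited(uint32_t field_number, const char* data,
                                 uint32_t size, std::string* out) {
  EncodeVarint32((field_number << 3) | 2, out);  // field key, wire type 2
  EncodeVarint32(size, out);                     // payload length
  out->append(data, size);                       // raw bytes, single copy
}

int main() {
  std::vector<float> tensor = {1.f, 2.f, 3.f};  // hypothetical tensor payload
  std::string buf;
  WriteLengthDelimited(1, reinterpret_cast<const char*>(tensor.data()),
                       static_cast<uint32_t>(tensor.size() * sizeof(float)),
                       &buf);
  std::cout << "encoded " << buf.size() << " bytes\n";
  return 0;
}

The ProtoEncodeHelper removed below wraps this same pattern behind WriteVarlengthBeginning (key plus length) and WriteRawBytes (payload).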
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b2a6a161adcc83b97ea5fad96eecb0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? 
kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a32c852f2759e0fc0dcfc63f6d38578..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
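One detail of the send handler removed below is worth spelling out: in async modes, gradient shards arrive under decorated names containing "@PIECE", and the handler recovers the base variable name by splitting on '@' (enforcing exactly three parts) before renaming the scope variable and running the prepared context registered for the base name. A small, self-contained sketch of that name handling follows; the decorated name is hypothetical, since the actual naming scheme comes from the Python-side transpiler and is not part of this diff:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Standalone stand-in for the string::Split(varname, '@') call in the handler.
static std::vector<std::string> SplitOnAt(const std::string& s) {
  std::vector<std::string> parts;
  std::stringstream ss(s);
  std::string item;
  while (std::getline(ss, item, '@')) parts.push_back(item);
  return parts;
}

int main() {
  std::string varname = "emb_grad@PIECE@1";  // hypothetical decorated name
  std::string run_varname = varname;
  if (varname.find("@PIECE") != std::string::npos) {
    auto parts = SplitOnAt(varname);
    if (parts.size() == 3) {
      run_varname = parts[0];  // the handler also renames the var in the scope
    }
  }
  std::cout << "dispatch under: " << run_varname << "\n";  // prints emb_grad
  return 0;
}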
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f9104131c3129ea822e5c9f892845ea1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870decf2d56c321efbaddbc108fb113ea7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temprary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71ff94e11be634f69d4a2f8e9174d716..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
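The deleted rpc_client.cc wires its timeouts through gflags: DEFINE_int32 creates FLAGS_rpc_deadline and FLAGS_rpc_retry_times in one translation unit, and rpc_client.h re-exports them with DECLARE_int32 so other files can read them, for example as the default argument int64_t time_out = FLAGS_rpc_deadline. Below is a minimal sketch of that pattern collapsed into a single file; the flag names match the originals, but CallWithDeadline is purely illustrative and not a real API.

#include <cstdint>
#include <iostream>
#include <string>
#include "gflags/gflags.h"

// In the original code these definitions live in rpc_client.cc ...
DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc");
DEFINE_int32(rpc_retry_times, 3, "retry times for rpc");

// ... and rpc_client.h pulls them in with DECLARE_int32 so callers can use
// them, e.g. as default arguments. Illustrative helper, not a real API:
void CallWithDeadline(const std::string& endpoint,
                      int64_t time_out = FLAGS_rpc_deadline) {
  std::cout << "calling " << endpoint << " with deadline " << time_out
            << " ms, up to " << FLAGS_rpc_retry_times << " retries\n";
}

int main(int argc, char* argv[]) {
  // e.g. ./a.out --rpc_deadline=60000 --rpc_retry_times=5
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  CallWithDeadline("127.0.0.1:6000");
  return 0;
}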
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
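GetInstance above relies on std::call_once so that exactly one process-wide client is built and the trainer id is recorded before any RPC is issued, via the Init helper that follows. The same idiom, reduced to a self-contained sketch: RpcClientBase and DummyClient are illustrative stand-ins for RPCClient and its concrete gRPC/BRPC implementations, not the Paddle classes.

#include <iostream>
#include <memory>
#include <mutex>

class RpcClientBase {
 public:
  virtual ~RpcClientBase() = default;
  virtual void InitImpl() {}

  template <typename T>
  static RpcClientBase* GetInstance(int trainer_id) {
    // Init runs exactly once, even if several threads race to get the client.
    std::call_once(init_flag_, &RpcClientBase::Init<T>, trainer_id);
    return client_.get();
  }

 protected:
  template <typename T>
  static void Init(int trainer_id) {
    trainer_id_ = trainer_id;
    client_.reset(new T());
    client_->InitImpl();
  }

  static int trainer_id_;

 private:
  static std::once_flag init_flag_;
  static std::unique_ptr<RpcClientBase> client_;
};

int RpcClientBase::trainer_id_ = 0;
std::once_flag RpcClientBase::init_flag_;
std::unique_ptr<RpcClientBase> RpcClientBase::client_;

class DummyClient : public RpcClientBase {
 public:
  void InitImpl() override { std::cout << "DummyClient initialized\n"; }
};

int main() {
  auto* a = RpcClientBase::GetInstance<DummyClient>(0);
  auto* b = RpcClientBase::GetInstance<DummyClient>(7);  // does not re-init
  std::cout << (a == b ? "same instance\n" : "different instances\n");
  return 0;
}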
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1fa11cb2a7de428463c20c145cb1b5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool 
RPCServer::NeedResetAllVars() { - std::unique_lock lock(mutex_); - return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock 
lock(mutex_); - var_map_.erase(var_name); -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e2556046358e62592214c9f648533b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
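The barrier machinery these declarations describe is a counter guarded by a mutex plus a condition variable: each trainer bumps the counter for an rpc name, and waiters are released once the counter reaches client_num_ or the server is shutting down. A self-contained sketch of that mechanism with plain std threads follows; BarrierSketch is a reduced stand-in for the removed RPCServer logic, with no RPC plumbing.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

// Reduced model of the RPCServer barrier: one counter per rpc name,
// Wait blocks until every client has checked in (or shutdown is requested).
class BarrierSketch {
 public:
  explicit BarrierSketch(int client_num) : client_num_(client_num) {}

  void Increase(const std::string& rpc_name) {
    std::unique_lock<std::mutex> lock(mutex_);
    int b = ++counter_[rpc_name];
    if (b >= client_num_) {
      cond_.notify_all();
    }
  }

  void Wait(const std::string& rpc_name) {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [&] {
      return counter_[rpc_name] >= client_num_ || exit_;
    });
  }

  void ShutDown() {
    std::lock_guard<std::mutex> lock(mutex_);
    exit_ = true;
    cond_.notify_all();
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  std::unordered_map<std::string, int> counter_;
  const int client_num_;
  bool exit_ = false;
};

int main() {
  BarrierSketch barrier(3);
  std::vector<std::thread> trainers;
  for (int i = 0; i < 3; ++i) {
    trainers.emplace_back([&barrier, i] {
      std::cout << "trainer " << i << " sent its gradients\n";
      barrier.Increase("RequestSend");
    });
  }
  barrier.Wait("RequestSend");  // server side: all trainers have checked in
  std::cout << "batch barrier reached\n";
  for (auto& t : trainers) t.join();
  return 0;
}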
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f59285400033df2726fe519fe76dd63bd8c504d5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16fbfbe648a835101d67218bf473cdb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb2670e4c83184167e7758119e95e72cf9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include - -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - -DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); -DEFINE_int32(rpc_retry_bind_port, 3, - "Retry to bind the address if address is already used."); - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -static TensorPayload GetCommunicationAllocationFromTensor( - const platform::DeviceContext& ctx, const framework::Tensor& tensor) { - if (is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - is_gpu_place(tensor.place()), true, - platform::errors::PreconditionNotMet("Please run in gpu place.")); - auto& gpu_dev_ctx = - reinterpret_cast(ctx); - auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - platform::CUDAPinnedPlace cuda_pinned; - auto result = memory::AllocShared(cuda_pinned, copy_size); - - memory::Copy(cuda_pinned, result->ptr(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), - tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); - return TensorPayload(result); -#else - PADDLE_THROW( - platform::errors::Unavailable("This situation should not be happened")); -#endif - } else { - return TensorPayload(tensor); - } -} -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto tensor = var->Get(); - // FIXME(wuyi): data types in send_recv.proto is copied from - // framework.proto - request->set_data_type(static_cast(tensor.type())); - for (auto& dim : framework::vectorize(tensor.dims())) { - request->add_dims(dim); - } - const framework::LoD lod = tensor.lod(); - if (lod.size() > 0) { - request->set_lod_level(lod.size()); - for (auto& each : lod) { - VarMsg::LodData* lod_inner = request->add_lod(); - for (auto& d : each) { - lod_inner->add_lod_data(d); - } - } - } - return GetCommunicationAllocationFromTensor(ctx, tensor); -} - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto* slr = var->GetMutable(); - request->set_data_type(static_cast(slr->value().type())); - request->set_lod_level(0); - request->set_slr_height(slr->height()); - - for (auto& dim : framework::vectorize(slr->value().dims())) { - request->add_dims(dim); - } - - auto* tensor = slr->mutable_value(); - return GetCommunicationAllocationFromTensor(ctx, *tensor); -} - -TensorPayload::TensorPayload(std::shared_ptr allocation) - : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {} -TensorPayload::TensorPayload(const framework::Tensor& tensor) - : allocation_(tensor.Holder()), - offset_(tensor.offset()), - memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {} -void* TensorPayload::ptr() const { - return reinterpret_cast( - reinterpret_cast(allocation_->ptr()) + 
offset_); -} -size_t TensorPayload::memory_size() const { return memory_size_; } -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h deleted file mode 100644 index 84ed1ab0247124bfd494810400fea4d108acf164..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { -class Tensor; -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -class TensorPayload final { - public: - explicit TensorPayload(const framework::Tensor& tensor); - explicit TensorPayload(std::shared_ptr allocation); - - TensorPayload(const TensorPayload& o) = default; - TensorPayload& operator=(const TensorPayload& o) = default; - - void* ptr() const; - size_t memory_size() const; - - private: - std::shared_ptr allocation_; - size_t offset_; - size_t memory_size_; -}; - -inline void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -inline framework::proto::VarType::Type ToVarType( - sendrecv::VariableMessage::Type type) { - switch (type) { - case sendrecv::VariableMessage::FP32: - return framework::proto::VarType::FP32; // NOLINT - case sendrecv::VariableMessage::FP64: - return framework::proto::VarType::FP64; // NOLINT - case sendrecv::VariableMessage::INT32: - return framework::proto::VarType::INT32; // NOLINT - case sendrecv::VariableMessage::INT64: - return framework::proto::VarType::INT64; // NOLINT - case sendrecv::VariableMessage::BOOL: - return framework::proto::VarType::BOOL; // NOLINT - default: - PADDLE_THROW( - platform::errors::InvalidArgument("Not support type id: %d.", type)); - } -} - -template